def copy_Data_from_Raw_to_Result():
    """Copy every document of RawPOA["news"] into ResultPOA["news"].

    Before insertion each copied document receives two placeholder labels:
    ``classification`` is set to "undefined" and ``sentiment`` to "99".
    Every document is printed as it is collected. The MongoDB host and port
    are read from the "database" section of the configuration through
    ``mod_config.getConfig``.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    RawPOA = conn.RawPOA
    ResultPOA = conn.ResultPOA

    CopyList = []
    for iterm in RawPOA["news"].find():
        # Each cursor item is a fresh dict, so it can be labelled in place.
        iterm["classification"] = "undefined"
        iterm["sentiment"] = "99"
        CopyList.append(iterm)
        print(iterm)

    # Skip the write when the source collection is empty: pymongo is expected
    # to reject a bulk insert of an empty list instead of treating it as a
    # no-op.
    if CopyList:
        ResultPOA["news"].insert(CopyList)
    print("复制成功")
# Example #2
# 0
def spider():
    """Crawl Baidu news search results for every configured university.

    Builds a dict keyed by each university's ``zh_name`` whose values are
    lists of article documents with the keys "Uname", "abbr", "title", "url",
    "date" and "body". An article is kept only when ``filter(document)``
    returns the string "true".

    NOTE(review): this block looks truncated. The outer ``try`` has no
    ``except``/``finally`` clause and no ``return`` statement is visible, so
    the end of the function is missing from this view.
    """

    UniversityList = mod_config.get_University_list()
    resultList = {}

    # Initialise the per-university news containers.
    for uni in UniversityList:
        resultList[uni["zh_name"]] = []

    # Request URL template; {school} is replaced by the university name.
    base_url = "http://news.baidu.com/ns?word={school}&pn=0&rn=20&cl=2"

    # Set the user agent sent with every request.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    headers = {'User-Agent': user_agent}

    # Start crawling the recent news of each university in the list.
    for uni in UniversityList:
        url = base_url.format(school=uni["zh_name"])
        print "开始爬取" + uni["zh_name"] + "..."
        # Pause between search requests.
        time.sleep(2)

        try:
            request = urllib2.Request(url, headers=headers)
            html = urllib2.urlopen(request).read()
            # One match per search hit: everything between the title heading
            # and the info span.
            re_result = r'<h3 class="c-title">(.*?)<span class="c-info">'
            re_href = r'<a href="(.*?)"'
            re_date = r'<p class="c-author">.*?&nbsp;&nbsp;(.*?)</p>'
            result = re.findall(re_result, html, re.S | re.M)

            for detail in result:
                # First link inside the hit; an IndexError here is not caught
                # by the inner handlers and propagates to the outer try.
                href = re.findall(re_href, detail, re.S | re.M)[0]
                try:
                    date = re.findall(re_date, detail, re.S | re.M)[0]
                except Exception, e:
                    # No date found in this hit: fall back to a placeholder.
                    date = "暂无日期"
                    print e

                # Pause between article requests.
                time.sleep(1)
                try:
                    request = urllib2.Request(href, headers=headers)
                    # `html` is rebound to the article page here; `result`
                    # was already extracted from the search page above.
                    html = urllib2.urlopen(request, timeout=5).read()
                    parseHTMLResult = process_BodyText(html)
                    Uname = uni["zh_name"]
                    abbr = uni["en_name"]
                    document = {
                        "Uname": Uname,
                        "abbr": abbr,
                        "title": parseHTMLResult["title"],
                        "url": href,
                        "date": date,
                        "body": parseHTMLResult["body"]
                    }
                    # NOTE(review): `filter` is called with one argument and
                    # compared to "true", so it is presumably a project
                    # function shadowing the builtin - confirm.
                    if filter(document) == "true":
                        resultList[uni["zh_name"]].append(document)
                except Exception, e:
                    # A failing article is reported and skipped.
                    print e
def insert_university_list():
    """Insert the configured university list into ResultPOA["universitylist"].

    The MongoDB host and port come from the "database" section of the
    configuration; the documents to insert are whatever
    ``mod_config.get_University_list()`` returns.
    """
    # Gather the configuration values.
    host = mod_config.getConfig("database", "db_Host")
    port = mod_config.getConfig("database", "db_Port")
    universities = mod_config.get_University_list()

    # Connect and write the list into the target collection.
    client = MongoClient(host, int(port))
    target_collection = client.ResultPOA["universitylist"]
    target_collection.insert(universities)

    print("插入成功")
def main():
    """Classify one hard-coded sample headline and print both predictions.

    The sample text is segmented with ``seg_ChineseText``; a TF-IDF
    representation is then built and a prediction made for the "sentiment"
    and the "classification" entries of ``mod_config.get_ModelType_Info()``.
    A MongoDB connection is still opened, although only the disabled batch
    mode kept in the comments below uses it.
    """
    # Configuration values.
    host = mod_config.getConfig("database", "db_Host")
    port = mod_config.getConfig("database", "db_Port")
    universities = mod_config.get_University_list()

    model_info = mod_config.get_ModelType_Info()

    # Database handles (referenced only by the disabled batch mode).
    client = MongoClient(host, int(port))
    result_db = client.ResultPOA
    raw_db = client.RawPOA
    #local_result_db = MongoClient('192.168.1.3',27017).ResultPOA

    # Disabled batch mode: label every stored news item of every university
    # and save the labelled copies into the result database.
    #
    # for uni in universities:
    #
    #     InsertList = []
    #     for iterm in raw_db["news"].find({"Uname": uni["zh_name"]}):
    #
    #         text_seg = seg_ChineseText(iterm["body"])
    #
    #         tfidf_sentiment = get_tfidf(text_seg,model_info["sentiment"])
    #         tfidf_classification = get_tfidf(text_seg,model_info["classification"])
    #
    #         sentimentResult = get_predictedResult(tfidf_sentiment,model_info["sentiment"])
    #         classificationResult = get_predictedResult(tfidf_classification,model_info["classification"])
    #
    #         newsDoc = iterm
    #         newsDoc["sentiment"] = sentimentResult
    #         newsDoc["classification"] = classificationResult
    #         InsertList.append(newsDoc)
    #
    #     result_db["news"].insert(InsertList)
    #     #local_result_db["news"].insert(InsertList)
    #     print uni["zh_name"] + "的新闻信息已保存完毕"

    sample_text = "上海海事大学2017年拟录取硕士研究生名单公示"
    tokens = seg_ChineseText(sample_text)

    # Build both feature vectors first, then run both predictions, in the
    # order sentiment -> classification.
    tasks = ("sentiment", "classification")
    features = [get_tfidf(tokens, model_info[task]) for task in tasks]
    predictions = [
        get_predictedResult(vector, model_info[task])
        for vector, task in zip(features, tasks)
    ]

    report = "待分类预测文本:'{0}' 的分类结果为:\n 情感分类:{1}\n 类别分类:{2}\n".format(
        sample_text, predictions[0], predictions[1])
    print(report)
def main():
    """Download the news of every configured university into RawPOA.

    For each university the news is fetched with ``request_NewsInfo`` and
    stored with ``save_DataToDB``. A progress message is printed before each
    download, and a summary plus a timestamp after it. The local time is
    also printed once at start-up and once at the end.
    """
    # Configuration values.
    host = mod_config.getConfig("database", "db_Host")
    port = mod_config.getConfig("database", "db_Port")
    universities = mod_config.get_University_list()

    # Database handle the downloaded news is written to.
    raw_db = MongoClient(host, int(port)).RawPOA

    time_format = '%Y-%m-%d %X'

    def current_timestamp():
        # Local time rendered with the format above.
        return time.strftime(time_format, time.localtime())

    print(current_timestamp())

    # `position` is the 1-based progress counter.
    for position, uni in enumerate(universities, 1):
        remaining = len(universities) - position
        print("The system begins to download the %dst university. "
              "There are %d universities left." % (position, remaining))

        news_items = request_NewsInfo(uni)
        save_DataToDB(news_items, raw_db)

        summary = ("The system has finished downloading the news of " +
                   uni["en_name"] + ". There are total " +
                   str(len(news_items)) + " news.\n")
        print(summary)

        print(current_timestamp())

    print("Work finished!\n")
    print(current_timestamp())
def create_TrainData():
    """Dump news bodies from RawPOA["news"] into text files under ./rawdata/.

    For every configured university a directory ``./rawdata/<chn_name>/`` is
    created if missing, and up to 50 news documents whose "Uname" equals the
    university's ``chn_name`` are written there, one file per document, named
    after the document title with a ".txt" suffix.

    NOTE(review): this block looks truncated. It ends on a bare ``finally:``
    whose body is missing, and no close of the opened file is visible in
    this view.
    """

    # Read the configuration parameters.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")
    UniversityList = mod_config.get_University_list()

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    RawPOA = conn.RawPOA

    newsCollection = RawPOA["news"]

    for uni in UniversityList:

        # One output directory per university.
        path = './rawdata/' + uni["chn_name"] + "/"
        if not os.path.exists(path):
            os.makedirs(path)

        # Use the first 50 news items of each university as training data.
        for iterm in newsCollection.find({"Uname": uni["chn_name"]}).limit(50):

            # The UTF-8 encoded, stripped title becomes the file name.
            filePath = path + str(
                iterm["title"].encode("utf8")).strip() + ".txt"
            fileBody = iterm["body"]

            try:

                f = codecs.open(filePath, 'w', "utf-8")
                f.write(fileBody)
                print filePath + "  写入成功"

            except Exception, e:
                # Report the failure and continue with the next document.
                print e
                print filePath + "  写入失败"

            finally:
def create_NewsNumbersInfo():
    """Rebuild ResultPOA["newsNumber"] with per-university news counts.

    For every configured university and each classification ("study",
    "activity", "entrance", "social") the documents in ResultPOA["news"] are
    counted per sentiment label "-1", "0" and "1". Each classification is
    stored as a four-element list ``[count("-1"), count("0"), count("1"),
    total]`` under the key ``<classification>Number``, next to the
    university's "Uname" and "abbr". The existing "newsNumber" collection is
    dropped and replaced by the new summaries.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")
    UniversityList = mod_config.get_University_list()

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    ResultPOA = conn.ResultPOA
    news = ResultPOA["news"]

    # Queries run in this order: classification by classification, and
    # within each one sentiment "-1", then "0", then "1".
    classifications = ("study", "activity", "entrance", "social")
    sentiments = ("-1", "0", "1")

    ResultList = []
    for uni in UniversityList:
        summary = {"Uname": uni["zh_name"], "abbr": uni["en_name"]}

        for classification in classifications:
            counts = []
            for sentiment in sentiments:
                counts.append(news.find({
                    "Uname": uni["zh_name"],
                    "classification": classification,
                    "sentiment": sentiment
                }).count())
            # Fourth element: total over the three sentiment labels.
            counts.append(sum(counts))
            summary[classification + "Number"] = counts

        ResultList.append(summary)

    ResultPOA["newsNumber"].drop()
    # With an empty university list there is nothing to write; pymongo is
    # expected to reject a bulk insert of an empty list, so skip it.
    if ResultList:
        ResultPOA["newsNumber"].insert(ResultList)
    print("保存成功")
# Example #8
# 0
def mainTask():
    """Crawl fresh news, drop already-known items, classify and store the rest.

    Steps:
      1. Load up to 20 dated items per university (sorted by "date",
         descending) from RawPOA["tempNews"] and ResultPOA["tempNews"]
         into in-memory caches.
      2. Run ``spider()`` and keep only the items whose URL is found in
         neither cache.
      3. Label each new item with ``predict`` and insert the labelled
         documents into ResultPOA["news"].
      4. For every new item added to the front of a cache, remove one item
         from its end, then rewrite both "tempNews" collections from the
         caches.
      5. Call ``update_NewsNumbersInfo()``.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")
    UniversityList = mod_config.get_University_list()

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    RawPOA = conn.RawPOA
    ResultPOA = conn.ResultPOA

    # Per-university caches keyed by the university's zh_name.
    raw_tempList = {}
    result_tempList = {}
    for uni in UniversityList:
        raw_tempList[uni["zh_name"]] = []
        result_tempList[uni["zh_name"]] = []

    # Fields copied from a cached document; the result cache also keeps the
    # two labels.
    raw_fields = ("body", "title", "url", "Uname", "abbr", "date")
    result_fields = raw_fields + ("sentiment", "classification")

    print("从缓存新闻表中取出数据...")
    for uni in UniversityList:
        name = uni["zh_name"]

        raw_cursor = RawPOA["tempNews"].find({
            "Uname": name
        }).sort("date", pym.DESCENDING).limit(20)
        for iterm in raw_cursor:
            # Items carrying the "no date" placeholder are left out.
            if iterm["date"].encode("utf-8") != "暂无日期":
                doc = dict((key, iterm[key]) for key in raw_fields)
                raw_tempList[name].insert(0, doc)

        result_cursor = ResultPOA["tempNews"].find({
            "Uname": name
        }).sort("date", pym.DESCENDING).limit(20)
        for iterm in result_cursor:
            if iterm["date"].encode("utf-8") != "暂无日期":
                doc = dict((key, iterm[key]) for key in result_fields)
                result_tempList[name].insert(0, doc)

    print("从缓存新闻表中取出数据成功\n\n")

    spiderList = spider()

    print("开始查重去重操作...")

    for uni in UniversityList:
        name = uni["zh_name"]
        raw_cache = raw_tempList[name]
        result_cache = result_tempList[name]

        count = 0
        predictedList = []
        for iterm in spiderList[name]:
            # An item is new only when BOTH lookups report "not found".
            # Assumes findUrlInList returns -1 for a missing URL, as the
            # first comparison in the original code implies. The second
            # lookup used to be tested for truthiness, which also accepted
            # URLs already present at any non-zero index.
            is_new = (findUrlInList(iterm["url"], raw_cache) == -1
                      and findUrlInList(iterm["url"], result_cache) == -1)
            if not is_new:
                continue

            count = count + 1
            raw_cache.insert(0, iterm)
            predictResult = predict(iterm["body"])
            tempIterm = dict((key, iterm[key]) for key in raw_fields)
            tempIterm["sentiment"] = predictResult["sentiment"]
            tempIterm["classification"] = predictResult["classification"]
            result_cache.insert(0, tempIterm)
            predictedList.append(tempIterm)

        # Remove one item from the end of each cache per item added to its
        # front.
        for _ in range(count):
            raw_cache.pop()
            result_cache.pop()

        # Nothing new for this university: skip the write, since pymongo is
        # expected to reject a bulk insert of an empty list.
        if predictedList:
            ResultPOA["news"].insert(predictedList)
        print("结果数据表的数据保存成功")

    print("去重操作完成\n\n")

    # Rewrite both cache collections from the in-memory caches.
    RawPOA["tempNews"].drop()
    ResultPOA["tempNews"].drop()
    for uni in UniversityList:
        name = uni["zh_name"]
        if raw_tempList[name]:
            RawPOA["tempNews"].insert(raw_tempList[name])
        if result_tempList[name]:
            ResultPOA["tempNews"].insert(result_tempList[name])

    print("数据库存入数据成功\n\n")

    update_NewsNumbersInfo()