def testModelBySame(blockwordsList, blockfromList):
    """Decide for every pending mail whether it is spam.

    Mails whose ``type`` is 2 (awaiting classification) are first judged by
    rules, and only the mails no rule decides are handed to the saved models:

    1. Sender: each comma-separated address of ``from`` is checked against
       the whitelist (verdict 0) and the sender blocklist (verdict 1);
       otherwise the verdict is 2 (undecided). When ``from`` holds several
       addresses the last one decides, as in the original implementation.
    2. Title and content: a blocked word found in either one forces
       verdict 1, overriding the sender verdict.
    3. Mails still at verdict 2 are classified by the models loaded from
       ``../data``.

    NOTE(review): this file defines ``testModelBySame`` more than once; the
    definition that appears last is the one bound at import time.

    Args:
        blockwordsList: Not used. The blocked words come from
            ``getBlockedWords()``; kept for caller compatibility.
        blockfromList: Not used. The blocked senders come from
            ``getBlockedWords()``; kept for caller compatibility.

    Returns:
        A tuple ``(dfWhitelist, dfSafe, dfLeft, dfBlocked)`` of DataFrames:
        whitelisted mails (type 0), mails that already had type 0 in the
        input, mails classified by the model (``type`` holds the predicted
        label) and mails rejected by the rules (type 1).
    """
    words_blocklist, from_blocklist = getBlockedWords()
    white_list = getWhitelist()
    df = getMails()

    # Copy the selection: it keeps df's row labels, so it must be written to
    # as a whole column or via masks, never through a 0..n-1 counter.
    dfForEvaluate = df[df['type'] == 2].copy()
    dfSafe = df[df['type'] == 0]  # mails with attachments default to normal

    # An empty blocked word is a substring of every text and would mark
    # every mail as spam, so such entries are ignored.
    blocked_words = [word for word in words_blocklist if word]

    senders = dfForEvaluate["from"].astype("str")
    titles = dfForEvaluate["title"].astype("str")
    contents = dfForEvaluate["content"].astype("str")

    verdicts = []  # one entry per row: 0 whitelisted, 1 blocked, 2 undecided
    for sender, title, content in zip(senders, titles, contents):
        verdict = 2
        for address in sender.strip().split(","):
            if address in white_list:
                verdict = 0
            elif address in from_blocklist:
                verdict = 1
            else:
                verdict = 2
        # Blocked words in the title or the content win over the sender rule.
        if any(word in title or word in content for word in blocked_words):
            verdict = 1
        verdicts.append(verdict)

    dfForEvaluate['blocked'] = verdicts
    is_whitelisted = dfForEvaluate['blocked'] == 0
    is_blocked = dfForEvaluate['blocked'] == 1
    dfForEvaluate.loc[is_whitelisted, 'type'] = 0
    dfForEvaluate.loc[is_blocked, 'type'] = 1

    dfBlocked = dfForEvaluate[is_blocked]
    dfWhitelist = dfForEvaluate[is_whitelisted]
    # Copy again because the predicted labels are written into this frame.
    dfLeft = dfForEvaluate[dfForEvaluate['blocked'] == 2].copy()

    if dfLeft.empty:
        # Nothing left for the model; the transformers reject empty input.
        return dfWhitelist, dfSafe, dfLeft, dfBlocked

    # Load the saved models used for prediction.
    transformer_model = joblib.load("../data/result_save_TFM_try")
    svd_model = joblib.load("../data/result_save_SVDM_try")
    model = joblib.load("../data/result_save_AdaBoost_try")

    jieba_cut_content = [
        jiebaclearText(line) for line in dfLeft["content"].astype("str")
    ]
    data_test = pd.DataFrame(
        svd_model.transform(transformer_model.transform(jieba_cut_content)))
    dfLeft['type'] = [int(label) for label in model.predict(data_test)]
    return dfWhitelist, dfSafe, dfLeft, dfBlocked
def testModelBySame(testFilePath):
    """Label every mail of a CSV file with the saved models.

    The file is read without a header row into the columns ``frome``, ``to``,
    ``title``, ``content`` and ``classes``. Each ``content`` value is
    converted to text (rows with missing values are not dropped), passed
    through ``jiebaclearText`` and the two saved transformers, and the
    classifier's predictions replace the ``classes`` column.

    NOTE(review): this file defines ``testModelBySame`` more than once; the
    definition that appears last is the one bound at import time.

    Args:
        testFilePath: Path of the comma-separated, UTF-8 encoded mail file.

    Returns:
        The DataFrame read from the file, with ``classes`` holding the
        predicted labels as ints.
    """
    column_names = ['frome', 'to', 'title', 'content', 'classes']

    # Presumably a TF-IDF transformer, an SVD reducer and an AdaBoost
    # classifier, judging by the file names -- verify against the training code.
    text_transformer = joblib.load("../data/result_save_TFM_try")
    reducer = joblib.load("../data/result_save_SVDM_try")
    classifier = joblib.load("../data/result_save_AdaBoost_try")

    df = pd.read_csv(
        testFilePath, names=column_names, encoding="utf-8", sep=",")

    cleaned_texts = [
        jiebaclearText(text) for text in df["content"].astype("str")
    ]
    features = pd.DataFrame(
        reducer.transform(text_transformer.transform(cleaned_texts)))

    df['classes'] = [int(label) for label in classifier.predict(features)]
    return df
def process_file(file_path):
    """Flatten one mail file into a single comma-separated line.

    The mail is parsed by ``read_file`` into a dict. The ``from``, ``to`` and
    ``data`` entries are emitted first, followed by the ``content`` entry
    after it has been passed through ``jiebaclearText``.

    Args:
        file_path: Path of the mail file handed to ``read_file``.

    Returns:
        A string of the form ``from,to,data,content``.
    """
    mail = read_file(file_path)

    # A missing key falls back to the literal "unkown". Commas are removed
    # from every field so they cannot clash with the field separator.
    header_fields = [
        mail.get(key, "unkown").replace(",", "").strip()
        for key in ("from", "to", "data")
    ]

    # The body additionally loses "+" and "_" before it is cleaned.
    body = mail.get("content", "unkown")
    for unwanted in (",", "+", "_"):
        body = body.replace(unwanted, "").strip()

    return ",".join(header_fields + [jiebaclearText(body)])
def testModelBySame(testFilePath):
    """Predict the class of each mail listed in a CSV file.

    NOTE(review): this file defines ``testModelBySame`` more than once; the
    definition that appears last is the one bound at import time.

    Args:
        testFilePath: Path of a UTF-8, comma-separated file without a header
            row; its columns are read as ``frome``, ``to``, ``title``,
            ``content`` and ``classes``.

    Returns:
        The loaded DataFrame whose ``classes`` column has been overwritten
        with the predicted labels (ints).
    """
    # Step 1: load the models saved by the training run, in the order in
    # which they are applied below.
    saved_model_paths = (
        "../data/result_save_TFM_try",
        "../data/result_save_SVDM_try",
        "../data/result_save_AdaBoost_try",
    )
    transformer_model, svd_model, model = (
        joblib.load(path) for path in saved_model_paths
    )

    # Step 2: read the mails and preprocess the text of every row.
    df = pd.read_csv(
        testFilePath,
        names=['frome', 'to', 'title', 'content', 'classes'],
        encoding="utf-8",
        sep=",",
    )
    raw_contents = df["content"].astype("str")
    prepared_contents = list(map(jiebaclearText, raw_contents))
    transformed = transformer_model.transform(prepared_contents)
    data_test = pd.DataFrame(svd_model.transform(transformed))

    # Step 3: classify and store the labels as plain ints.
    df['classes'] = list(map(int, model.predict(data_test)))
    return df
def testModelBySame(blockwordsList, blockfromList):
    """Decide for every pending mail whether it is spam.

    Rows of ``getMails()`` with ``type == 2`` are evaluated; rows with
    ``type == 0`` are returned untouched. Rules run first and the saved
    models only see what the rules leave undecided:

    1. Sender: each comma-separated address of ``from`` is looked up in the
       whitelist (verdict 0) and in the sender blocklist (verdict 1);
       anything else is undecided (verdict 2). With several addresses the
       last one decides, as in the original implementation.
    2. Title and content: a blocked word in either one forces verdict 1,
       overriding the sender verdict.
    3. Rows still at verdict 2 are classified by the models loaded from
       ``../data``; the predicted labels are printed to stdout.

    NOTE(review): this file defines ``testModelBySame`` more than once; the
    definition that appears last is the one bound at import time.

    Args:
        blockwordsList: Not used. The blocked words come from
            ``getBlockedWords()``; kept for caller compatibility.
        blockfromList: Not used. The blocked senders come from
            ``getBlockedWords()``; kept for caller compatibility.

    Returns:
        A tuple ``(dfWhitelist, dfSafe, dfLeft, dfBlocked)`` of DataFrames:
        whitelisted rows (type 0), rows that already had type 0, rows
        classified by the model (``type`` holds the predicted label) and
        rows rejected by the rules (type 1).
    """
    words_blocklist, from_blocklist = getBlockedWords()
    white_list = getWhitelist()
    df = getMails()

    # Copy the selection: it keeps df's row labels, so it must be written to
    # as a whole column or via masks, never through a 0..n-1 counter.
    dfForEvaluate = df[df['type'] == 2].copy()
    dfSafe = df[df['type'] == 0]

    # An empty blocked word is a substring of every text and would mark
    # every mail as spam, so such entries are ignored.
    blocked_words = [word for word in words_blocklist if word]

    senders = dfForEvaluate["from"].astype("str")
    titles = dfForEvaluate["title"].astype("str")
    contents = dfForEvaluate["content"].astype("str")

    verdicts = []  # one entry per row: 0 whitelisted, 1 blocked, 2 undecided
    for sender, title, content in zip(senders, titles, contents):
        verdict = 2
        for address in sender.strip().split(","):
            if address in white_list:
                verdict = 0
            elif address in from_blocklist:
                verdict = 1
            else:
                verdict = 2
        # Blocked words in the title or the content win over the sender rule.
        if any(word in title or word in content for word in blocked_words):
            verdict = 1
        verdicts.append(verdict)

    dfForEvaluate['blocked'] = verdicts
    is_whitelisted = dfForEvaluate['blocked'] == 0
    is_blocked = dfForEvaluate['blocked'] == 1
    dfForEvaluate.loc[is_whitelisted, 'type'] = 0
    dfForEvaluate.loc[is_blocked, 'type'] = 1

    dfBlocked = dfForEvaluate[is_blocked]
    dfWhitelist = dfForEvaluate[is_whitelisted]
    # Copy again because the predicted labels are written into this frame.
    dfLeft = dfForEvaluate[dfForEvaluate['blocked'] == 2].copy()

    if dfLeft.empty:
        # Nothing left for the model; the transformers reject empty input.
        return dfWhitelist, dfSafe, dfLeft, dfBlocked

    transformer_model = joblib.load("../data/result_save_TFM_try")
    svd_model = joblib.load("../data/result_save_SVDM_try")
    model = joblib.load("../data/result_save_AdaBoost_try")

    jieba_cut_content = [
        jiebaclearText(line) for line in dfLeft["content"].astype("str")
    ]
    data_test = pd.DataFrame(
        svd_model.transform(transformer_model.transform(jieba_cut_content)))
    resultList = [int(label) for label in model.predict(data_test)]
    dfLeft['type'] = resultList
    print(resultList)
    return dfWhitelist, dfSafe, dfLeft, dfBlocked