# Imports assumed at the top of the script; mytool, stopwordslist, LSTM_model,
# class_list and words2index are defined elsewhere in the project.
import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences  # or tensorflow.keras.preprocessing.sequence


def predict_webclass(webdata):
    X_train_text = []
    # Concatenate all page paragraphs into one string
    tmp_data = ""
    for data in webdata['webtext']:
        tmp_data = tmp_data + data
    len_webtext = len(tmp_data)
    if len_webtext == 0:
        return "数据过少"  # too little data
    # Count Chinese characters; if less than half of the text is Chinese,
    # treat the site as a foreign-language site
    rule = re.compile(u"[^\u4E00-\u9FA5]")
    len_chinese = len(rule.sub('', tmp_data))
    if len_chinese / len_webtext < 0.5:
        return "外语网站"  # foreign-language site
    if len(webdata['webtext']) >= 15:
        X_train_text.append(mytool.get_all_webdata(webdata))
    else:
        return "数据过少"  # too little data

    # Convert the text into tensors
    # X_train: data to run the model on
    X_train = []
    for sentence in X_train_text:
        tmp_words = mytool.seg_sentence(sentence, stopwordslist)
        X_train.append(words2index(tmp_words))

    # 3. Run the trained model
    model_max_len = 300
    x_train_raw = pad_sequences(X_train, maxlen=model_max_len)
    predicted = LSTM_model.predict(x_train_raw)
    predicted = class_list[np.argmax(predicted)]
    return predicted
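# Hypothetical usage sketch (not part of the original script): load one stored
# page file and classify it. The file path is made up for illustration, and it is
# assumed that mytool.read_webdata() returns a dict with a 'webtext' list, as in
# the functions above; LSTM_model, class_list and stopwordslist must already be
# loaded.
if __name__ == "__main__":
    sample = mytool.read_webdata("/home/jiangy2/webdata/news/www.example.com.txt")
    print(predict_webclass(sample))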
def find_weburldata(weburl):
    # Look up the stored page file for a URL, first in the old crawl directory
    # and then in the new one; both directories share the same layout, so the
    # same search is run over each base path in turn.
    for path in ("/home/jiangy2/webdata/", "/home/jiangy2/dnswork/newwebdata/"):
        for subpath in os.listdir(path):
            filepath = os.path.join(path, subpath)
            if not os.path.isdir(filepath):
                continue
            for filename in os.listdir(filepath):
                # File names look like "www.example.com.txt"; compare them to the
                # query URL with the ".txt" suffix and "www." prefix stripped
                fileurl = filename.replace(".txt", "").replace("www.", "")
                stripfilename = weburl.replace("www.", "")
                if fileurl == stripfilename:
                    webdata = mytool.read_webdata(os.path.join(filepath, filename))
                    # Only accept pages with a title, description, keywords and
                    # at least 15 text paragraphs
                    if (webdata['title'] != "" and webdata['description'] != ""
                            and webdata['keywords'] != ""):
                        if len(webdata['webtext']) >= 15:
                            return mytool.get_all_webdata(webdata)
    return ""
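# Hypothetical usage sketch (not in the original script): look up one URL and
# report whether any stored page text was found. find_weburldata() returns the
# concatenated page text, or "" when no acceptable file exists.
sample_text = find_weburldata("example.com")
if sample_text == "":
    print("No stored page data for this URL")
else:
    print("Found", len(sample_text), "characters of page text")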
# Walk all sub-folders under the path
for d in dirs:
    dirlist.append(os.path.join(root, d))
    # print(os.path.join(root, d))

## Read the suspicious web pages
webdata_list = []
for dirpath in dirlist:
    print(dirpath)
    for root, dirs, files in os.walk(dirpath):
        for f in files:
            if f.replace(".txt", "") in filenamelist:
                data = mytool.read_webdata(os.path.join(root, f))
                # print(os.path.join(root, f))
                # Collect the page text into one list
                target_data = mytool.get_all_webdata(data)
                # Word segmentation with stop words removed
                tmp_words = mytool.seg_sentence(target_data, stopwordslist)
                webdata_list.append(tmp_words)
print(webdata_list[0])
print("Suspicious pages read:", len(webdata_list))

# Build the term-frequency corpus and train the LDA model
dictionary = corpora.Dictionary(webdata_list)
# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...]
# corpus is each document converted to IDs: every element is the dictionary ID
# of a word in that document together with its frequency
corpus = [dictionary.doc2bow(text) for text in webdata_list]
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)
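# Optional check, not in the original excerpt: print the words that dominate each
# of the 3 learned topics. print_topics() is standard gensim API; showing 10 words
# per topic is an arbitrary choice here.
for topic_id, topic_words in lda.print_topics(num_topics=3, num_words=10):
    print(topic_id, topic_words)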
j = 0  # number of valid pages; i (total pages crawled) is initialized earlier
for subpath in fs:
    filepath = os.path.join(path, subpath)
    # print(filepath)
    if os.path.isdir(filepath):
        webdata_classtype = classtype[subpath]                # look up the parent category
        webdata_class_index = class_index[webdata_classtype]  # index of the parent category
        webdata_path = os.listdir(filepath)
        for filename in webdata_path:
            i = i + 1
            webdata = mytool.read_webdata(os.path.join(filepath, filename))
            # Keep only pages with a title, description, keywords and at least
            # 15 text paragraphs
            if (webdata['title'] != "" and webdata['description'] != ""
                    and webdata['keywords'] != ""):
                if len(webdata['webtext']) >= 15:
                    j = j + 1
                    X_train_text.append(mytool.get_all_webdata(webdata))
                    Y_train.append(webdata_class_index)
print("Pages crawled:")
print(i)
print("Valid pages:")
print(j)

# 2.4 Convert the text into tensors
# X_train: training data
X_train = []


# Map each word to its embedding index (indices start at 1) and return the list
# of indices
def words2index(words):
    index_list = []
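# The body of words2index is cut off in this excerpt. Below is a minimal sketch of
# what it plausibly does, assuming a word_index dict (word -> integer index,
# starting at 1) built from the training vocabulary; this is an assumption, not
# the original implementation.
def words2index_sketch(words, word_index):
    index_list = []
    for word in words:
        if word in word_index:  # skip out-of-vocabulary words
            index_list.append(word_index[word])
    return index_list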