def predict_webclass(webdata):
    X_train_text = []
    # Concatenate all text fragments of the page into one string
    tmp_data = ""
    for data in webdata['webtext']:
        tmp_data = tmp_data + data
    len_webtext = len(tmp_data)
    if len_webtext == 0:
        return "数据过少"  # "too little data" -- guards the division below against empty pages
    # Count CJK ideographs; if less than half the characters are Chinese, skip the page
    rule = re.compile(u"[^\u4E00-\u9FA5]")
    len_chinese = len(rule.sub('', tmp_data))
    if len_chinese / len_webtext < 0.5:
        return "外语网站"  # "foreign-language site"
    if len(webdata['webtext']) >= 15:
        X_train_text.append(mytool.get_all_webdata(webdata))
    else:
        return "数据过少"  # "too little data"

    # Convert the text into index sequences (same preprocessing as at training time)
    X_train = []
    for sentence in X_train_text:
        tmp_words = mytool.seg_sentence(sentence, stopwordslist)
        X_train.append(words2index(tmp_words))

    # Predict with the trained LSTM model
    model_max_len = 350  # must match the maxlen used at training time (350 in the training code below)
    x_train_raw = pad_sequences(X_train, maxlen=model_max_len)
    predicted = LSTM_model.predict(x_train_raw)
    predicted = class_list[np.argmax(predicted)]
    return predicted
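The only self-contained piece of predict_webclass is the language filter, so it is easy to sanity-check in isolation. Below is a minimal sketch of the same check, reusing the CJK range \u4E00-\u9FA5 and the 0.5 threshold from the function above; is_chinese_page is a hypothetical helper name, not part of the original code:

import re

def is_chinese_page(text):
    """Return True when at least half of the characters are CJK ideographs."""
    if not text:  # empty page: cannot decide, treat as non-Chinese
        return False
    non_chinese = re.compile(u"[^\u4E00-\u9FA5]")
    chinese_only = non_chinese.sub('', text)  # strip everything that is not a CJK ideograph
    return len(chinese_only) / len(text) >= 0.5

print(is_chinese_page(u"这是一个中文网页的示例文本"))    # True
print(is_chinese_page(u"This page is mostly English."))  # False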
# 2.4 Convert the text into index tensors
# X_train: the training data
X_train = []

# Map each word to its embedding index (indices start at 1; 0 is reserved for padding).
# Words that are not in the embedding vocabulary are dropped. Returns a list of indices.
def words2index(words):
    index_list = []
    for word in words:
        if word in embeddings_index:  # is the word in the embedding vocabulary?
            index_list.append(embeddings_index[word])
    return index_list

for sentence in X_train_text:
    tmp_words = mytool.seg_sentence(sentence, stopwordslist)
    X_train.append(words2index(tmp_words))

# 3 Machine learning training
model_max_len = 350

# 3.1 Define the model
def get_lstm_model():
    model = Sequential()
    model.add(
        Embedding(
            input_dim=EMBEDDING_length + 1,  # +1 because index 0 is the padding slot
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            # input_length=200,
            # ... (the excerpt is truncated here; a hedged reconstruction of the
            # remaining layers follows below)
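The model definition above is cut off inside the Embedding layer. For reference, here is a minimal self-contained sketch of how such a model is typically finished and compiled. The layer size (128 LSTM units), num_classes, and the stand-in values for EMBEDDING_length, EMBEDDING_DIM, and embedding_matrix are assumptions for illustration, not the post's actual settings:

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Stand-ins for the variables built earlier in the post (assumed sizes)
EMBEDDING_length = 5000   # vocabulary size
EMBEDDING_DIM = 100       # word-vector dimensionality
embedding_matrix = np.zeros((EMBEDDING_length + 1, EMBEDDING_DIM))  # row 0 = padding
num_classes = 5           # assumed; in the post this would be len(class_list)
model_max_len = 350

def get_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=EMBEDDING_length + 1,
                        output_dim=EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=model_max_len,
                        trainable=False))   # keep the pretrained vectors fixed
    model.add(LSTM(128))                    # assumed layer size
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model = get_lstm_model()
model.summary()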
        dirlist.append(os.path.join(root, d))
        # print(os.path.join(root, d))

## Read the data of the suspect web pages
webdata_list = []
for dirpath in dirlist:
    print(dirpath)
    for root, dirs, files in os.walk(dirpath):
        for f in files:
            if f.replace(".txt", "") in filenamelist:
                data = mytool.read_webdata(os.path.join(root, f))
                # print(os.path.join(root, f))
                # Store each page's tokenized text into one list
                target_data = mytool.get_all_webdata(data)
                # Word segmentation
                tmp_words = mytool.seg_sentence(target_data, stopwordslist)
                webdata_list.append(tmp_words)

print(webdata_list[0])
print("Number of suspect pages read:", len(webdata_list))

# Build the term-frequency matrix and train the LDA model
dictionary = corpora.Dictionary(webdata_list)
# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...]
# corpus is each document converted to bag-of-words form: for every word,
# its ID in the dictionary and its frequency in the document
corpus = [dictionary.doc2bow(text) for text in webdata_list]
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)
topic_list = lda.print_topics(3)
print("Word distributions of the 3 topics:\n")
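Once the LDA model is trained, the topic mixture of each page can be read back from it. Below is a minimal self-contained sketch with toy tokenized documents standing in for webdata_list; the words and the two-topic setup are made up for illustration:

from gensim import corpora, models

# Toy tokenized documents standing in for webdata_list
docs = [["bank", "login", "password", "verify"],
        ["lottery", "prize", "winner", "claim"],
        ["bank", "account", "verify", "login"]]

dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(text) for text in docs]
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)

# Word distribution of every topic, as printed in the snippet above
for topic_id, words in lda.print_topics(2):
    print(topic_id, words)

# Topic distribution of one document: a list of (topic_id, probability) pairs
print(lda.get_document_topics(corpus[0]))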