Example #1
def predict_webclass(webdata):
    X_train_text = []
    tmp_data = ""
    for data in webdata['webtext']:
        tmp_data = tmp_data + data
    len_webtext = len(tmp_data)
    # Keep only Chinese characters to measure the Chinese-text ratio
    rule = re.compile(u"[^\u4E00-\u9FA5]")
    len_chinese = len(rule.sub('', tmp_data))
    if len_webtext == 0:
        return "数据过少"  # "too little data" (guards against division by zero below)
    if len_chinese / len_webtext < 0.5:
        return "外语网站"  # "foreign-language website"
    if len(webdata['webtext']) >= 15:
        X_train_text.append(mytool.get_all_webdata(webdata))
    else:
        return "数据过少"  # "too little data"
    # Convert the text to tensors
    # X_train: training data
    X_train = []
    for sentence in X_train_text:
        tmp_words = mytool.seg_sentence(sentence, stopwordslist)
        X_train.append(words2index(tmp_words))
    # 3 Machine learning: run the trained model
    model_max_len = 300
    x_train_raw = pad_sequences(X_train, maxlen=model_max_len)
    predicted = LSTM_model.predict(x_train_raw)
    predicted = class_list[np.argmax(predicted)]
    return predicted
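A minimal usage sketch (not part of the original): predict_webclass expects a dict whose 'webtext' field is a list of page-text strings, as the concatenation loop above implies; the sample content below is hypothetical.

webdata = {'webtext': ['第一段网页正文', '第二段网页正文']}  # hypothetical page data: a 'webtext' list of text blocks
print(predict_webclass(webdata))  # with fewer than 15 text blocks this returns "数据过少" ("too little data")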
Example #2
# 2.4 Convert the text to tensors
# X_train: training data
X_train = []


# Convert words to word-vector indices (indices start at 1); returns a list of indices
def words2index(words):
    index_list = []
    for word in words:
        if word in embeddings_index:  # skip words that have no word vector
            index_list.append(embeddings_index[word])
    return index_list
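For context, embeddings_index, embedding_matrix, EMBEDDING_length and EMBEDDING_DIM are defined outside this excerpt. A rough sketch of how they could be built from a pre-trained word-vector file, consistent with the indices-start-at-1 convention above (the file name and format are assumptions, not taken from the original):

import numpy as np

EMBEDDING_DIM = 300                               # assumed vector size
embeddings_index = {}                             # word -> row index in embedding_matrix (starts at 1)
vectors = [np.zeros(EMBEDDING_DIM)]               # row 0 is reserved for padding
with open("word_vectors.txt", encoding="utf-8") as f:   # hypothetical vector file: "word v1 v2 ... vN" per line
    for i, line in enumerate(f, start=1):
        parts = line.rstrip().split(" ")
        embeddings_index[parts[0]] = i
        vectors.append(np.asarray(parts[1:], dtype="float32"))
EMBEDDING_length = len(embeddings_index)
embedding_matrix = np.vstack(vectors)             # shape: (EMBEDDING_length + 1, EMBEDDING_DIM)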


for sentence in X_train_text:
    tmp_words = mytool.seg_sentence(sentence, stopwordslist)
    X_train.append(words2index(tmp_words))

# 3 Machine-learning training
model_max_len = 350
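Before training, each index list has to be padded or truncated to model_max_len; that step is not shown at this point in the excerpt, but it mirrors the pad_sequences call in Example #1. A sketch (the import path is an assumption and differs between Keras versions):

from keras.preprocessing.sequence import pad_sequences  # assumed import path

# Pad/truncate every page's index sequence to a fixed length for the Embedding layer
x_train = pad_sequences(X_train, maxlen=model_max_len)   # shape: (num_pages, model_max_len)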


# 3.1 Define the model
def get_lstm_model():
    model = Sequential()
    model.add(
        Embedding(
            input_dim=EMBEDDING_length + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            # input_length=200,
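The model definition above is cut off in this excerpt. A minimal sketch of how such an Embedding + LSTM classifier is commonly assembled in Keras, with layer sizes and training settings that are assumptions rather than the original's:

from keras.models import Sequential               # assumed import paths
from keras.layers import Embedding, LSTM, Dense


def get_lstm_model_sketch():
    model = Sequential()
    model.add(Embedding(input_dim=EMBEDDING_length + 1,
                        output_dim=EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False))                 # keep pre-trained vectors fixed (assumption)
    model.add(LSTM(128))                                  # hidden size is an assumption
    model.add(Dense(len(class_list), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model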
Example #3
            dirlist.append(os.path.join(root, d))
            # print(os.path.join(root, d))

## Read the suspect web-page data
webdata_list = []
for dirpath in dirlist:
    print(dirpath)
    for root, dirs, files in os.walk(dirpath):
        for f in files:
            if f.replace(".txt","") in filenamelist: ###
                data = mytool.read_webdata(os.path.join(root, f))
                # print(os.path.join(root, f))
                # 网页数据存入一个list
                target_data = mytool.get_all_webdata(data)
                #分词
                tmp_words = mytool.seg_sentence(target_data, stopwordslist)
                webdata_list.append(tmp_words)

print(webdata_list[0])
print("读取疑似网页内容共:" , len(webdata_list))


# Build the bag-of-words corpus and train the LDA model
dictionary = corpora.Dictionary(webdata_list)
# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...]
# corpus is each document mapped to IDs: for every word in a document, its dictionary ID and its frequency
corpus = [dictionary.doc2bow(text) for text in webdata_list]

lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)
topic_list = lda.print_topics(3)
print("3个主题的单词分布为:\n")