Example #1
def predict_webclass(webdata):
    # Relies on module-level globals: re, np, mytool, stopwordslist,
    # words2index, pad_sequences, LSTM_model and class_list.
    X_train_text = []
    tmp_data = ""
    for data in webdata['webtext']:
        tmp_data = tmp_data + data
    len_webtext = len(tmp_data)
    if len_webtext == 0:  # guard: an empty page would divide by zero below
        return "数据过少"  # "too little data"
    # Strip everything that is not a CJK character, then measure the
    # proportion of Chinese text.
    rule = re.compile(u"[^\u4E00-\u9FA5]")
    len_chinese = len(rule.sub('', tmp_data))
    if len_chinese / len_webtext < 0.5:
        return "外语网站"  # "foreign-language website"
    if len(webdata['webtext']) >= 15:
        X_train_text.append(mytool.get_all_webdata(webdata))
    else:
        return "数据过少"  # "too little data"
    # Convert the text to tensors
    # X_train: input data for the model
    X_train = []
    for sentence in X_train_text:
        tmp_words = mytool.seg_sentence(sentence, stopwordslist)
        X_train.append(words2index(tmp_words))
    # 3. Predict with the trained LSTM model
    model_max_len = 300
    x_train_raw = pad_sequences(X_train, maxlen=model_max_len)
    predicted = LSTM_model.predict(x_train_raw)
    predicted = class_list[np.argmax(predicted)]
    return predicted
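A minimal usage sketch; the dict keys mirror the accesses made by the function, but the field values here are invented for illustration:

webdata = {
    'title': u"示例标题", 'description': u"示例描述", 'keywords': u"示例",
    'webtext': [u"一段中文正文。"] * 20,  # hypothetical page text blocks
}
print(predict_webclass(webdata))  # a label from class_list, or an early-return string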
Example #2
def find_weburldata(weburl):
    # Look in both data roots for a .txt file whose name matches the URL
    # (ignoring any "www." prefix) and return the combined page text.
    paths = ["/home/jiangy2/webdata/", "/home/jiangy2/dnswork/newwebdata/"]
    stripfilename = weburl.replace("www.", "")
    for path in paths:
        for subpath in os.listdir(path):
            filepath = os.path.join(path, subpath)
            if not os.path.isdir(filepath):
                continue
            for filename in os.listdir(filepath):
                fileurl = filename.replace(".txt", "").replace("www.", "")
                if fileurl != stripfilename:
                    continue
                webdata = mytool.read_webdata(
                    os.path.join(filepath, filename))
                # Keep only pages with full metadata and enough text blocks.
                if (webdata['title'] != "" and webdata['description'] != ""
                        and webdata['keywords'] != ""):
                    if len(webdata['webtext']) >= 15:
                        return mytool.get_all_webdata(webdata)
    return ""
Example #3
        # collect every sub-directory under the path
        for d in dirs:
            dirlist.append(os.path.join(root, d))
            # print(os.path.join(root, d))

## Read the data of the suspected web pages
webdata_list = []
for dirpath in dirlist:
    print(dirpath)
    for root, dirs, files in os.walk(dirpath):
        for f in files:
            if f.replace(".txt", "") in filenamelist:
                data = mytool.read_webdata(os.path.join(root, f))
                # print(os.path.join(root, f))
                # collect the page data into one list
                target_data = mytool.get_all_webdata(data)
                # word segmentation
                tmp_words = mytool.seg_sentence(target_data, stopwordslist)
                webdata_list.append(tmp_words)

print(webdata_list[0])
print("读取疑似网页内容共:" , len(webdata_list))


# Build the bag-of-words corpus and train the LDA model
dictionary = corpora.Dictionary(webdata_list)
# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...]
# corpus holds each document converted to IDs: every element pairs a
# word's dictionary ID with its frequency in that document.
corpus = [dictionary.doc2bow(text) for text in webdata_list]

lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)
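Once trained, the topics can be inspected with gensim's standard print_topics API; the output format in the comment is illustrative:

# Each entry is (topic_id, "0.05*word1 + 0.04*word2 + ...").
for topic_id, topic in lda.print_topics(num_topics=3, num_words=10):
    print(topic_id, topic)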
Example #4
i = 0  # pages crawled
j = 0  # pages with usable data
for subpath in fs:
    filepath = os.path.join(path, subpath)
    # print(filepath)
    if os.path.isdir(filepath):
        webdata_classtype = classtype[subpath]  # look up the parent class
        webdata_class_index = class_index[webdata_classtype]  # index of the parent class
        webdata_path = os.listdir(filepath)
        for filename in webdata_path:
            i = i + 1
            webdata = mytool.read_webdata(os.path.join(filepath, filename))
            if webdata['title'] != "" and webdata[
                    'description'] != "" and webdata['keywords'] != "":
                if len(webdata['webtext']) >= 15:
                    j = j + 1
                    X_train_text.append(mytool.get_all_webdata(webdata))
                    Y_train.append(webdata_class_index)

print("已爬取网页数:")
print(i)
print("有效网页数:")
print(j)
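The classtype and class_index lookups above assume two mappings built earlier in the script; a hypothetical shape, with directory names and labels invented for illustration:

classtype = {"news_sites": "news", "shop_sites": "ecommerce"}  # sub-directory -> parent class
class_index = {"news": 0, "ecommerce": 1}  # parent class -> training label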

# 2.4 Convert the text to tensors
# X_train: training data
X_train = []


# Map each word to its embedding index (indices start at 1); returns a list of indices
def words2index(words):
    index_list = []
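The listing breaks off here; a minimal completion consistent with the comment above, assuming a global vocabulary dict word_index (word -> index, starting at 1; the name is hypothetical), might be:

def words2index(words):
    index_list = []
    for word in words:
        if word in word_index:  # word_index: assumed global vocabulary dict
            index_list.append(word_index[word])
    return index_list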