Example #1
import numpy as np
from keras.models import load_model

from utils import file_pickle as fp

data = fp.get_pickle_data('../data_mid/keyword_tfidf_200.pkl')
test_case = data['test']
del data

# `caces` holds the case feature vectors and `laws` the law-article feature
# vectors built in the earlier feature-construction steps.
model = load_model('../data_model/lstm_model_plus1.h5')
result = []
for cace_i, cace in enumerate(caces):
    # score the current case against every law article in one batch
    case_input = [cace] * len(laws)
    pred = model.predict({
        'case_input': np.array(case_input),
        'law_input': np.array(laws)
    })
    pred = pred[:, 0]
    # keep the law articles scoring at least 0.85, then take the top 5
    cur_index = np.where(pred >= 0.85)[0]
    cur_val = pred[cur_index]
    # +1 turns the 0-based array index into the 1-based law-article number
    sort_val = sorted([(cur_index[i] + 1, cur_val[i])
                       for i in range(len(cur_index))],
                      key=lambda k: k[1],
                      reverse=True)
    tmp_result = sort_val[:5]
    result.append(tmp_result)
    # checkpoint the results every 20 cases
    if cace_i % 20 == 0:
        fp.save_pickle_data('../data_result/result1.h5', result)
    # print progress
    print(test_case[cace_i])
    print(cace_i, tmp_result)
    print('-----------------------------')
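Every snippet on this page goes through a small `utils/file_pickle.py` helper imported as `fp`. Its implementation is not shown, so the following is only a minimal sketch of what it is assumed to look like: two plain-pickle wrappers with the function names the examples call.

# Assumed sketch of utils/file_pickle.py; the examples only rely on these
# two functions, so plain pickle wrappers are sufficient here.
import pickle


def save_pickle_data(path, data):
    """Serialize `data` to `path` with pickle."""
    with open(path, 'wb') as f:
        pickle.dump(data, f)


def get_pickle_data(path):
    """Load and return the object pickled at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)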
Example #2
from gensim import corpora, models

from utils import file_pickle as fp

# `texts` is the list of tokenised documents produced by the preprocessing step.
dictionary = corpora.Dictionary(texts)
# build a bag-of-words vector for each document
corpus = [dictionary.doc2bow(text) for text in texts]
# train the TF-IDF model
tfidf = models.TfidfModel(corpus)
# compute the TF-IDF weights of every document
# text_tfidf format: [[(index1, tfidf1), (index2, tfidf2), (index3, tfidf3)], [...], ...]
# text_index format: {index1: word1, index2: word2, index3: word3, ...}
text_tfidf = tfidf[corpus]
text_index = {index: word for word, index in dictionary.token2id.items()}
save_middle = {
    'text_tfidf': text_tfidf,
    'text_index': text_index
}
# save the model
fp.save_pickle_data('../data_result/tf_idf_modle.pkl', save_middle)
tf_id_midle = fp.get_pickle_data('../data_result/tf_idf_modle.pkl')
# text_tfidf = tf_id_midle['text_tfidf']
# text_index = tf_id_midle['text_index']
#
# # find the 100 highest-weighted words in each document
# len_train_word = 40000
# word_num = 100
# max_result = []
#
# for doc_tfidf in text_tfidf:
#     tmp_text_word = []
#     sorted_list = sorted(doc_tfidf, key=lambda d: d[1], reverse=True)
#     for key, value in sorted_list[0:word_num]:
#         tmp_text_word.append(text_index[key])
#     max_result.append(tmp_text_word)
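As a quick usage sketch (not part of the original snippet), the same `dictionary`, `tfidf`, and `text_index` objects can score a document that was not in the training corpus; the example tokens below are placeholders.

# Hypothetical usage sketch: weight an unseen, already-tokenised document
# with the TF-IDF model trained above.
new_doc = ['合同', '违约', '赔偿']        # placeholder tokens
bow = dictionary.doc2bow(new_doc)         # tokens -> (word_id, count) pairs
weights = tfidf[bow]                      # -> [(word_id, tfidf_weight), ...]
for word_id, weight in sorted(weights, key=lambda w: w[1], reverse=True):
    print(text_index[word_id], round(weight, 4))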
Example #3
        tmp_word_set.add(key)
    j = 0
    # sort the remaining words by TF-IDF and add them to the result set
    sorted_list = sorted(text_tfidf[i], key=lambda d: d[1], reverse=True)

    while len(tmp_word_set) < word_num and j < len(sorted_list):
        word = index_word[sorted_list[j][0]]
        if len(word) > 1 and not contain_number.match(word):
            tmp_word_set.add(sorted_list[j][0])
        j += 1
    result = []
    for key, value in text_tfidf[i]:
        if key in tmp_word_set:
            result.append(index_word[key])
    keyword_tfidf_result.append(result)
    print(i, len(result), result)
    print(
        '-------------------------------------------------------------------------------------------------------'
    )

print(len(keyword_tfidf_result))
print(len(key_word))

keyword_tfidf_result = {
    'train': keyword_tfidf_result[:train_len],
    'test': keyword_tfidf_result[train_len:],
}

fp.save_pickle_data('../data_mid/keyword_tfidf_200_clean.pkl',
                    keyword_tfidf_result)
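The loop above starts mid-iteration; the setup that defines `text_tfidf`, `index_word`, `word_num`, `contain_number`, `key_word`, and `train_len` is not part of the excerpt. The sketch below is an assumed reconstruction of that setup based on the names used in the loop, not the author's original code.

# Assumed setup for the keyword-selection loop above (not shown in the excerpt).
import re

from utils import file_pickle as fp

tf_idf_middle = fp.get_pickle_data('../data_result/tf_idf_modle.pkl')
text_tfidf = tf_idf_middle['text_tfidf']    # per-document [(word_id, weight), ...]
index_word = tf_idf_middle['text_index']    # word_id -> word

word_num = 200                        # assumed from the output file name keyword_tfidf_200
contain_number = re.compile(r'.*\d')  # used with .match(), so this flags words containing a digit
keyword_tfidf_result = []
# key_word (pre-extracted keywords per document) and train_len (size of the
# training split) are assumed to be loaded earlier and are not reconstructed here.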
Example #4
#
# fp.save_pickle_data('../data_feature/train_punish2.pkl', train)

import codecs
import re

import numpy as np

from utils import file_pickle as fp

# build the features for the test set
test_texts = fp.get_pickle_data('../data_mid/keyword_tfidf_200_clean.pkl')
test_texts = test_texts['test']
word_index = fp.get_pickle_data('../data_result/word_embedding.pkl')
word_index = word_index['word_index']

test = []
max_num = max([len(texts) for texts in test_texts])
print(max_num)
for i in range(len(test_texts)):
    text_feature = np.zeros(max_num)
    for j in range(len(test_texts[i])):
        text_feature[j] = word_index[test_texts[i][j]]
    test.append(text_feature)

file = codecs.open("../data_origin/test.txt", "rb", "utf-8")
test_id = []
for line in file:
    tmp = re.split('\t|\n', line)
    test_id.append(tmp[0])

file.close()

print(len(test_id), len(test))
print(test_id[0], test[0])
test = {'id': test_id, 'value': test}
fp.save_pickle_data('../data_feature/test_punish2.pkl', test)
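The manual zero-padding loop above can also be written with Keras' `pad_sequences`; a minimal equivalent sketch, assuming the same `test_texts`, `word_index`, and `max_num`:

# Equivalent padding with keras.preprocessing.sequence.pad_sequences
# (a sketch, not the author's original code).
from keras.preprocessing.sequence import pad_sequences

sequences = [[word_index[w] for w in texts] for texts in test_texts]
# pad with zeros at the end up to the longest document, matching the loop above
test_padded = pad_sequences(sequences, maxlen=max_num, padding='post', value=0)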
Example #5
    tmp_feature = [0] * max_num
    for i in range(len(law)):
        tmp_feature[i] = word_index[law[i]]
    law_vector.append(np.array(tmp_feature))

# process the case-fact texts
case_vector = []
mutiple = 4
law_num = 452
max_num = max([len(texts) for texts in test_texts])
print(max_num)
# len(train_texts)
for i in range(len(test_texts)):
    text_feature = np.zeros(max_num)
    for j in range(len(test_texts[i])):
        text_feature[j] = word_index[test_texts[i][j]]
    case_vector.append(text_feature)
print(case_vector[0:5])
result = {
    'law_vector': law_vector,
    'case_vector': case_vector
}
fp.save_pickle_data('../data_feature/test.pkl', result)
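These vectors are what the prediction loop in Example #1 consumes. A short usage sketch (an assumption that mirrors Example #1 rather than code from this snippet):

# Usage sketch: load the features saved above and score one case against
# every law article with the two-input model from Example #1.
import numpy as np
from keras.models import load_model

from utils import file_pickle as fp

features = fp.get_pickle_data('../data_feature/test.pkl')
laws = features['law_vector']
cases = features['case_vector']

model = load_model('../data_model/lstm_model_plus1.h5')
pred = model.predict({
    'case_input': np.array([cases[0]] * len(laws)),
    'law_input': np.array(laws)
})[:, 0]
# 1-based indices of the five highest-scoring law articles
print(np.argsort(pred)[::-1][:5] + 1)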
Example #6
    ' ', '', '。', ',', '-', ':', '“', '”', '"', '‘', '’', '!', '(', ')', '~',
    '、', ',', '.', '(', ')', ';', ';', ':', '?', '?', '~', '《', '》', '<', '>',
    '──', '─', '…', '……', '[', ']', '【', '】', '~', ']', ']'
]
for line in file:
    tmp = re.split('\t|\n', line)
    word.append(
        [w for w in jieba.cut(tmp[1]) if w not in stop_word_list])
    panish_class.append(int(tmp[2]))
    law.append([int(num) for num in tmp[3].split(',')])
file.close()
print(word[666])
print(panish_class[666])
print(law[666])
result = {'word': word, 'panish_class': panish_class, 'law': law}
fp.save_pickle_data('data_mid/train_data.pkl', result)
# ————————————————————————————————————————————————————————————————————————————
file = codecs.open("data_origin/test.txt", "rb", "utf-8")
word = []
stop_word_list = [
    ' ', '', '。', ',', '-', ':', '“', '”', '"', '‘', '’', '!', '(', ')', '~',
    '、', ',', '.', '(', ')', ';', ';', ':', '?', '?', '~', '《', '》', '<', '>',
    '──', '─', '…', '……', '[', ']', '【', '】', '~', ']', ']'
]
for line in file:
    tmp = re.split('\t|\n', line)
    word.append(
        [w for w in jieba.cut(tmp[1]) if w not in stop_word_list])
file.close()
print(word[666])
fp.save_pickle_data('data_mid/test_data.pkl', word)
Example #7
from utils import file_pickle as fp

train = fp.get_pickle_data('../data_mid/train_data.pkl')
texts = train['word']
law = train['law']
panish_class = train['panish_class']

new_word = []
new_law = []
new_panish_class = []
for i in range(len(texts)):
    if len(texts[i]) >= 100:
        new_word.append(texts[i])
        new_law.append(law[i])
        new_panish_class.append(panish_class[i])

result = {'word': new_word, 'panish_class': new_panish_class, 'law': new_law}
fp.save_pickle_data('../data_mid/train_data.pkl', result)
Example #8
import gensim
import numpy as np

from utils import file_pickle as fp

# `texts` (built earlier from the train/test/law word lists) holds the tokenised documents.
del train_word, test_word, law_word

# assign an index to every word, starting from 1 (0 is reserved for padding)
i = 1
word_index = {}
for text in texts:
    for word in text:
        if word not in word_index:
            word_index[word] = i
            i += 1

# map each word index to its word2vec vector
model = gensim.models.Word2Vec.load('../data_result/word2vec_100.model')
index_vector = {}
for word in word_index.keys():
    index_vector[word_index[word]] = model.wv[word]
print(len(index_vector))

# fill the embedding matrix with the word vectors, starting at index 1 (row 0 stays all-zero for padding)
embedding_weights = np.zeros((len(index_vector) + 1, 100))
for index, w in index_vector.items():
    embedding_weights[index, :] = w
print(embedding_weights[0:3])

result = {
    'word_index': word_index,
    'index_vector': index_vector,
    'embedding_weights': embedding_weights
}
fp.save_pickle_data('../data_result/word_embedding.pkl', result)
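A brief usage sketch (not in the original code) showing how `embedding_weights` would typically back a Keras 2-style `Embedding` layer, such as the one presumably inside the model used in Example #1; `max_len` is a placeholder for the padded sequence length:

# Sketch: initialise an Embedding layer with the pre-trained matrix.
from keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=embedding_weights.shape[0],  # vocabulary size + 1 (row 0 = padding)
    output_dim=100,                        # word2vec dimensionality used above
    weights=[embedding_weights],           # initialise with the pre-trained vectors
    input_length=max_len,                  # assumed padded sequence length
    trainable=False                        # keep the word2vec vectors fixed
)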