Example 1
import pandas as pd
import synonyms as syn  # assumed alias: the snippet calls syn.v

def clean1_3(book_labels):
    print('book_label 1.3 cleaning started:', book_labels.shape,
          '---------------------------')
    # 1.3 Drop labels that do not exist in the dictionary (bag of words)
    indexbyWordBag = []
    labelsvector = []
    for i in range(book_labels.shape[0]):
        try:
            labelvector = syn.v(book_labels.iloc[i, 1])
            # Word found in the bag of words: keep its vector and row index
            labelsvector.append(labelvector)
            indexbyWordBag.append(i)
        except KeyError:
            # Word not in the bag of words: drop the row
            continue
    book_labels = book_labels.iloc[indexbyWordBag]
    book_labels = book_labels.reset_index(drop=True)
    labelvector = pd.DataFrame(labelsvector)
    book_labels = pd.concat([book_labels, labelvector],
                            ignore_index=True,
                            axis=1)
    print('book_label 1.3 cleaning finished:', book_labels.shape,
          '---------------------------')  # 10118, 102
    return book_labels
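A minimal usage sketch, assuming the layout implied by book_labels.iloc[i, 1]: column 0 holds a book id and column 1 a single label string (the sample rows here are hypothetical; "NOT_EXIST" reuses the out-of-vocabulary probe from Example 6):

import pandas as pd

sample = pd.DataFrame([[101, '金融'], [102, 'NOT_EXIST']])
cleaned = clean1_3(sample)
# Rows whose label raises KeyError in syn.v are dropped; surviving rows
# gain one extra column per vector dimension.
print(cleaned.shape)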
Example 2
    def synonym_search(self, question, method):
        # Slide character n-grams of length 2-5 over the question
        n_grams = [2, 3, 4, 5]
        term = []
        scores = []
        for n in n_grams:
            for i in range(len(question) - n + 1):
                word = question[i:i + n]
                if method == 'Bert':
                    wd_vec = bc.encode([word])[0]  # bc: BERT encoding client defined elsewhere
                elif method == 'Word2Vec':
                    try:
                        wd_vec = synonyms.v(word)
                    except KeyError:
                        wd_vec = [0] * 100  # zero vector for out-of-vocabulary words
                else:
                    wd_vec = 0  # unused by string-based methods such as Levenshtein
                term.append((word, wd_vec))

        # Score every n-gram against every known symptom entity
        for word, wd_vec in term:
            max_score = 0
            prob_entity = None  # initialised so the append below cannot hit an unbound name
            original_word = word
            for wd, vec in self.symptom_vec.items():
                if method == 'Bert':
                    score = np.inner(wd_vec, vec) / \
                        (np.linalg.norm(wd_vec) * np.linalg.norm(vec))
                if method == "Levenshtein":
                    score = Levenshtein.jaro(word, wd)
                if method == "Word2Vec":
                    try:
                        vec = self.symptom_w2v_vec[wd]
                    except KeyError:
                        vec = [0] * 100
                    # The 1e-5 terms keep the denominator non-zero for zero vectors
                    score = np.inner(wd_vec, vec) / \
                        ((np.linalg.norm(wd_vec) + 1e-5) * (np.linalg.norm(vec) + 1e-5))
                if score >= max_score:
                    max_score = score
                    prob_entity = wd
                    original_word = word
            scores.append((original_word, max_score, prob_entity))
        # takeSecond (defined elsewhere) returns the score field of each tuple
        scores.sort(key=takeSecond, reverse=True)
        return scores[:3]
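The Word2Vec branch above pads the denominator with 1e-5 so the [0] * 100 fallback vectors cannot cause a division by zero. A standalone sketch of that scoring rule (the helper name is illustrative, not from the original):

import numpy as np

def cosine_score(a, b, eps=1e-5):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    # eps keeps the denominator positive even for all-zero fallback vectors
    return float(np.inner(a, b) / ((np.linalg.norm(a) + eps) * (np.linalg.norm(b) + eps)))

print(cosine_score([1.0, 0.0], [1.0, 0.0]))  # ~1.0
print(cosine_score([0] * 100, [1] * 100))    # ~0.0 rather than a NaN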
Example 3
def test_word_vector(self):
    print("test_word_vector")
    word = "三国"
    print(word, "vector", synonyms.v(word))
Example 4
# -*- coding: utf-8 -*-

import synonyms
import numpy as np
from cosine import Cosine

cosine = Cosine(n_recommendation=4)

with open("vocabulary_filter.txt", "r", encoding="utf-8") as f:
    vocabulary = f.read().split()[:-1]

vectors = []
kept = []  # keeps vocabulary aligned with vectors when a word has no vector
for word in vocabulary:
    try:
        vectors.append(synonyms.v(word))  # word vector from synonyms
        kept.append(word)
    except KeyError:  # out-of-vocabulary word
        pass
vocabulary = kept  # otherwise vocabulary[nrow] below would point at the wrong word

vectors = np.array(vectors)

indices, similarities = cosine.cal_similarity(vectors,
                                              vectors)  # cosine similarity via the cosine module

with open("method_synonyms.csv", "w", encoding="utf-8") as f:
    for nrow, row in enumerate(indices):
        for ncol, col in enumerate(row):
            if ncol == 0:  # skip the word itself
                continue
            f.write("{},{},{}\n".format(vocabulary[nrow], vocabulary[col],
                                        similarities[nrow][ncol]))
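The cosine module is not shown here; judging from how indices and similarities are consumed above, cal_similarity presumably returns, per row, the indices of the most similar rows and their scores. A self-contained numpy sketch of that assumed behaviour:

import numpy as np

def cal_similarity(a, b, n_recommendation=4):
    # Row-normalise, then a @ b.T gives all pairwise cosine similarities
    a_n = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_n = b / np.linalg.norm(b, axis=1, keepdims=True)
    sims = a_n @ b_n.T
    # +1 because position 0 is the word itself, skipped by the caller above
    indices = np.argsort(-sims, axis=1)[:, :n_recommendation + 1]
    similarities = np.take_along_axis(sims, indices, axis=1)
    return indices, similarities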
Example 5
import re
import pandas as pd
import synonyms
# delteNanRow and Traditional2Simplified are project helpers defined elsewhere

def cleanData(book_labels_table):
    '''
    Preprocess the book labels. Cleaning rules:
    0. Keep only labels written in Chinese characters
    1. Drop labels longer than four Chinese characters
    2. Drop books that have no labels; their ids go into deletebookid=[]
    3. Drop duplicate labels attached to the same book
    :param book_labels_table:
    :return: book_labels_table
    '''
    # First remove books that have no labels
    book_labels_table = delteNanRow(book_labels_table)

    print('-------------------- Removing labels longer than four characters and labels missing from the dictionary ---------------------------')

    # Book ids and the label vectors of each book
    bookidlist = []
    booklabelsvector = []

    # 1. Keep only labels of 2-4 Chinese characters, convert traditional
    #    characters to simplified, and drop duplicate labels
    print('Keeping only 2-4 character labels, converting traditional to simplified, dropping duplicates')
    pattern = re.compile("^[\u4e00-\u9fa5]{2,4}$")  # 2-4 Chinese characters only
    for i in range(book_labels_table.shape[0]):
        # item_book_label collects the cleaned labels of this book
        item_book_label = []
        for j in range(len(book_labels_table.iloc[i, 1])):
            if re.match(pattern, book_labels_table.iloc[i, 1][j]):
                # Convert traditional characters to simplified
                book_labels_table.iloc[i, 1][j] = Traditional2Simplified(book_labels_table.iloc[i, 1][j])
                item_book_label.append(book_labels_table.iloc[i, 1][j])
        # Drop duplicate labels
        item_book_label = set(item_book_label)
        book_labels_table.iloc[i, 1] = list(item_book_label)

    # The first pass may leave some books without labels; remove them
    print('Removing books left without labels after the first cleaning pass')
    book_labels_table = delteNanRow(book_labels_table)

    # Second pass: drop labels that do not appear in our bag of words
    print('Second pass: dropping labels missing from the bag of words')
    for i in range(book_labels_table.shape[0]):
        # item_book_labelVector collects the word vectors of the kept labels
        item_book_label = []
        item_book_labelVector = []
        # Labels that survived the first pass may still be missing from the
        # bag of words and must be dropped here. Note: this can again leave a
        # book with no labels, so empty rows are removed once more below.
        for j in range(len(book_labels_table.iloc[i, 1])):
            try:
                wordvector = synonyms.v(book_labels_table.iloc[i, 1][j])
            except KeyError:
                # Word not in the bag of words: drop it
                continue
            # Word found: keep the label and its vector
            item_book_label.append(book_labels_table.iloc[i, 1][j])
            item_book_labelVector.append(wordvector)
        # Update this book's labels
        book_labels_table.iloc[i, 1] = item_book_label

        # Record the book id to build the book-id / label-vector table
        if len(book_labels_table.iloc[i, 1]) != 0:
            bookidlist.append(book_labels_table.iloc[i, 0])
            booklabelsvector.append(item_book_labelVector)

    book_labels_table = delteNanRow(book_labels_table)
    book_labels_vector_table = pd.DataFrame(zip(bookidlist, booklabelsvector))
    book_labels_vector_table.columns = ['bookid', 'book_labels_vector']

    print(book_labels_table)
    # The book-id / label-vector table is now complete
    print(book_labels_vector_table)

    print('Writing to storage')
    book_labels_table.index = book_labels_table['bookid']
    del book_labels_table['bookid']
    print(book_labels_table)
    book_labels_vector_table.index = book_labels_vector_table['bookid']
    del book_labels_vector_table['bookid']
    print(book_labels_vector_table)
    return book_labels_table, book_labels_vector_table
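The 2-to-4-character rule above hinges on the regex ^[\u4e00-\u9fa5]{2,4}$. A quick check of what it keeps and drops:

import re

pattern = re.compile("^[\u4e00-\u9fa5]{2,4}$")
for tag in ["金融", "人脸识别", "国际劳工组织", "AI"]:
    print(tag, bool(pattern.match(tag)))
# 金融 True (2 chars), 人脸识别 True (4 chars),
# 国际劳工组织 False (6 chars), AI False (not Chinese)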
Example 6
import synonyms
from snownlp import SnowNLP

# Sample input; the original assignment was garbled, so a one-item list is assumed here
data_long = ["国际劳工组织"]

# Part-of-speech segmentation: synonyms.seg returns (words, tags)
cixing = []
for i in range(len(data_long)):
    cixing.append(synonyms.seg(data_long[i]))

# Nearby-word lookup
test = synonyms.nearby("人脸")
print(test[0])
print("识别: %s" % (synonyms.nearby("识别"),))
print("NOT_EXIST: %s" % (synonyms.nearby("NOT_EXIST"),))
synonyms.display("金融")

# Word vectors; synonyms.v raises KeyError for out-of-vocabulary words
cixiangliang = []
for i in range(len(data_long)):
    try:
        cixiangliang.append(synonyms.v(data_long[i]))
    except KeyError:
        cixiangliang.append(-1)

# Sentiment score per phrase via SnowNLP
ciqinggan = []
for i in range(len(data_long)):
    s = SnowNLP(data_long[i])
    ciqinggan.append(s.sentiments)  # the original loop body was cut off; .sentiments is the likely intent
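For reference, synonyms.nearby returns two parallel sequences, the candidate words and their similarity scores, which is why test[0] above picks out the word list. A minimal sketch:

words, scores = synonyms.nearby("人脸")
print(words[:3])   # the nearest words; the first entry is typically the query itself
print(scores[:3])  # their similarity scores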
Example 7
def testWordVec(word):
    # Look up the synonyms vector for this word
    wordvector = synonyms.v(word)
    return wordvector
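A quick check of the returned vector; the [0] * 100 fallback in Example 2 suggests the synonyms vectors are 100-dimensional:

vec = testWordVec("人脸")
print(len(vec))  # expected 100, matching the fallback length used in Example 2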
Example 8
def word2vector(word):
    try:
        vector = synonyms.v(word)
    except Exception:
        # Out-of-vocabulary word (or any lookup failure): fall back to None
        vector = None
    return vector
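A minimal usage sketch: the None return lets callers filter out-of-vocabulary words without handling exceptions themselves ("NOT_EXIST" reuses the probe word from Example 6):

for w in ["人脸", "NOT_EXIST"]:
    vec = word2vector(w)
    print(w, "no vector" if vec is None else "dim=%d" % len(vec))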