def getSimilarity(df_content_o):
    """Build a gensim Similarity index over the 'content' column of *df_content_o*.

    Each document is tokenised by splitting on single spaces (the content is
    assumed to be pre-segmented text).

    Returns:
        (similarity, dictionary): the query index (keeping the top
        ``max_similar_num`` hits per query) and the fitted id2word mapping.
    """
    logging.debug('preparing docSim')
    # Pre-segmented text: one token per space-separated chunk.
    corpora_documents = [text.split(' ') for text in df_content_o['content']]
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    # num_features must cover every token id.  len(dictionary) is the exact
    # vocabulary size, so there is no need to re-scan every document with
    # itertools.chain (the original flagged this line with '#!!!!').
    similarity = Similarity('-Similarity-index', corpus,
                            num_features=len(dictionary))
    similarity.num_best = max_similar_num  # return only the top-N matches
    return similarity, dictionary
def get_docsim_feature(contents, remarks=""):
    """Build a gensim Similarity index (top-3 hits) over *contents*.

    Each item is tokenised with the project ``Tokenizer``.  The fitted
    dictionary and bag-of-words corpus are pickled under
    ``Config.cache_dir/docsim`` keyed by *remarks*.

    Returns:
        (similarity, corpus).  NOTE(review): the dictionary is pickled but
        not returned, so callers must reload it from ``dictionary_path`` to
        encode new queries — confirm this is intended.
    """
    dictionary_path = Config.cache_dir + "/docsim/dic_%s.pkl" % remarks
    corpus_path = Config.cache_dir + "/docsim/corpus_%s.pkl" % remarks

    tokenizer = Tokenizer()
    corpora_documents = [tokenizer(item_text) for item_text in contents]

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]

    # NOTE(review): num_features is hard-coded to 300, so token ids >= 300
    # are silently ignored by the index; len(dictionary) would be the safe
    # value.  Kept at 300 to preserve existing behaviour — confirm.
    similarity = Similarity('-Similarity-index', corpus, num_features=300)
    similarity.num_best = 3

    # Context managers guarantee the cache files are closed (the original
    # leaked both handles by passing bare open() results to pickle.dump).
    with open(dictionary_path, "wb") as fh:
        pickle.dump(dictionary, fh, protocol=4)
    with open(corpus_path, "wb") as fh:
        pickle.dump(corpus, fh, protocol=4)

    return similarity, corpus
def doc2bow():
    """Match a query against the FAQ list in questions.txt.

    Reads every question, tokenises it with ``del_stopword``, builds (or
    reloads a cached) gensim dictionary / corpus, then prints the stored
    questions most similar to the global query string ``s``.
    """
    corpora_documents = []
    text2 = []  # original (untokenised) questions, for echoing back to the user
    # 'with' guarantees the file handle is closed (original never closed it).
    with open("questions.txt", encoding='utf8') as file:
        lines = file.readlines()
    print(lines)
    for line in lines:
        line_strip = line.strip('\n')
        text2.append(line_strip)
        text1 = del_stopword(line_strip)  # tokenise and drop stopwords
        corpora_documents.append(text1)
    print(corpora_documents)

    # Dictionary, e.g. Dictionary(183 unique tokens: ['品种', '贷款', ...])
    dictionary = corpora.Dictionary(corpora_documents)
    # Reuse a previously saved dictionary if one exists.
    # NOTE(review): the original tested the absolute path
    # "E:\\ITCC\\mytest\\dict.txt" but always loaded/saved the relative
    # 'dict.txt'; the existence check now targets the file actually used.
    if os.path.exists('dict.txt'):
        dictionary = Dictionary.load('dict.txt')
    else:
        dictionary.save('dict.txt')
        dictionary = Dictionary.load('dict.txt')
    print(dictionary)

    # Bag-of-words corpus: [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1)], ...]
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    print(corpus)
    # Cache the corpus in MatrixMarket format for later runs.
    if os.path.exists("corpuse.mm"):
        corpus = corpora.MmCorpus('corpuse.mm')
    else:
        corpora.MmCorpus.serialize('corpuse.mm', corpus)
        corpus = corpora.MmCorpus('corpuse.mm')

    # num_features is an upper bound on the dictionary size (token id space).
    similarity = Similarity('-Similarity-index', corpus, num_features=400)

    test_data_1 = s  # NOTE(review): ``s`` must be defined globally — confirm
    test_cut_raw_1 = del_stopword(test_data_1)
    print(test_cut_raw_1)
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 5
    # (index_of_document, similarity) tuples for the best matches.
    print(similarity[test_corpus_1])
    for sample in similarity[test_corpus_1]:
        index = sample[0]
        if sample[1] == 1:
            # Exact match: answer directly and stop.
            print("你的问题是:" + str(text2[int(index)]) + "相似度:" + str(sample[1]))
            break
        elif sample[1] >= 0.8:
            # Close match: ask the user to confirm.
            print("你要问的问题是不是:" + str(text2[int(index)]) + "相似度:" + str(sample[1]))
        else:
            # Otherwise offer candidate questions for the user to pick from.
            print("相似的句子:" + str(text2[int(index)]) +
                  "相似度:" + str(sample[1]))
    print('################################')
DBName = "bullhorn"
db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName,
                     charset='utf8', use_unicode=True)

app = Flask(__name__)
CORS(app)

# Build the LSI similarity index once at startup and share it across requests.
resultTuple = generateCorpus()
dictionary = resultTuple['dictionary']
corpus = resultTuple['corpus']
socTitleDict = resultTuple['socTitleDict']

num_topics = 200
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics)
gensimIndex.num_best = 3


@app.before_request
def before_request():
    # NOTE(review): these two assignments create locals that shadow the
    # module-level ``db`` / ``resultTuple`` and are never read afterwards;
    # the fresh connection is also never closed.  Confirm whether a true
    # per-request refresh (via globals) was intended.
    db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName,
                         charset='utf8', use_unicode=True)
    resultTuple = generateCorpus()
    # Expose the startup-built index to request handlers via flask.g.
    g.gensimIndex = gensimIndex
'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚' ] corpora_documents = [] for item_text in raw_documents: #item_str = util_words_cut.get_class_words_list(item_text) item_str = list(jieba.cut(item_text)) corpora_documents.append(item_str) # 生成字典和向量语料 dictionary = corpora.Dictionary(corpora_documents) corpus = [dictionary.doc2bow(text) for text in corpora_documents] similarity = Similarity('-Similarity-index', corpus, num_features=400) test_data_1 = '你好,我想问一下我想离婚他不想离,孩子他说不要,是六个月就自动生效离婚' #test_cut_raw_1 = util_words_cut.get_class_words_list(test_data_1) test_cut_raw_1 = list(jieba.cut(test_data_1)) test_corpus_1 = dictionary.doc2bow(test_cut_raw_1) similarity.num_best = 5 print(similarity[test_corpus_1] ) # 返回最相似的样本材料,(index_of_document, similarity) tuples print('################################') test_data_2 = '家人因涉嫌运输毒品被抓,她只是去朋友家探望朋友的,结果就被抓了,还在朋友家收出毒品,可家人的身上和行李中都没有。现在已经拘留10多天了,请问会被判刑吗' #test_cut_raw_2 = util_words_cut.get_class_words_list(test_data_2) test_cut_raw_2 = list(jieba.cut(test_data_2)) test_corpus_2 = dictionary.doc2bow(test_cut_raw_2) similarity.num_best = 5 print(similarity[test_corpus_2] ) # 返回最相似的样本材料,(index_of_document, similarity) tuples
row = cursor.fetchone() while (row != None): print type(row[2]) doc = row[2] doclist = doc.lower().split() vec_bow = dictionary.doc2bow(doclist) num_topics = 200 lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics) vec_lsi = lsi[vec_bow] #Generate feature wordsId = [word[0] for word in vec_bow] wordsIdMap = zip(wordsId, vec_lsi) features = map(lambda x: dictionary.get(x[0]), sorted(wordsIdMap, key=lambda x: -x[1][1])) gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics) gensimIndex.num_best = 3 sims = gensimIndex[vec_lsi] for item in sims: socCode = socTitleDict[item[0]] score = item[1] bullhornCode = row[0] featuresList = " ".join(features[0:10]).replace('\'', '') sortedVecList = sorted(vec_lsi, key=lambda x: -x[1]) top10feature = [str(round(vec[1], 5)) for vec in sortedVecList[0:10]] featureScoreStr = " ".join(top10feature) #Write to the database sql = ''' INSERT INTO jobtitlematch (bullhorn_job_id, jobtitle_id, score, featurelist, feature_score) VALUES ('%d', '%d', '%f','%s', '%s'); ''' % (bullhornCode, socCode, score, featuresList, featureScoreStr)