def tf():
    dt = {}
    # if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"train_data.txt")
    train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(r"test_data.txt")
    test1_sentences = file_obj.read_lines()
    # Segmentation tool: a thin wrapper I added around jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    # Test set 1
    right_count = 0
    # w = open("result510tf.txt", 'w')
    # w.write(str("source_id") + '\t' + str("target_id") + '\n')
    for i in range(len(test1_sentences)):
        print "*********************"
        print i
        print test1_sentences[i]
        test = str(test1_sentences[i].encode("utf-8"))
        t = test.split(',')[0]
        # keys are (sentence index - 1), values are the computed distances
        sims = ss.similarity(test1_sentences[i])
        for k, v in sims.items():
            print t, k + 1, v   # e.g. 2784 2784 1.0
            ind2 = k + 1
            if str(k + 1) == str(t):
                print "same"
            else:
                # w.write(str(t) + '\t' + str(k + 1) + '\n')
                addtodict2(dt, int(t), int(ind2), v)
    # w.close()
    return dt
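# The loop above relies on addtodict2, which is not defined in this file.  A
# minimal sketch of what such a two-level dict helper might look like, inferred
# only from the call site addtodict2(dt, int(t), int(ind2), v); the name and
# behaviour are assumptions, not the original implementation:
def addtodict2(thedict, key_a, key_b, val):
    # Store val under a nested key, creating the inner dict on first use,
    # so that thedict[key_a][key_b] == val afterwards.
    if key_a in thedict:
        thedict[key_a].update({key_b: val})
    else:
        thedict.update({key_a: {key_b: val}})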
def run_prediction(input_file_path, output_file_path):
    # Read the training set
    file_obj = FileObj(r"./TFIDF_baseline/dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(input_file_path)
    test_sentences = file_obj.read_lines()
    # Segmentation tool based on jieba, with stop-word removal
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # tfidf model
    # Test set
    right_count = 0
    file_result = open(output_file_path, 'w')
    with open("./TFIDF_baseline/dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
        line = file_answer.readlines()
        for i in range(0, len(test_sentences)):
            top_15 = ss.similarity(test_sentences[i])
            '''
            for j in range(0, len(top_15)):
                answer_index = top_15[j][0]
                answer = line[answer_index]
                file_result.write(str(top_15[j][1]) + '\t' + str(answer))
            file_result.write("\n")
            '''
            # Write the answer paired with the most similar training question
            file_result.write(line[top_15[0][0]] + '\n')
    file_result.close()
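# Note: run_prediction assumes ss.similarity() returns a ranked list of
# (train_sentence_index, score) pairs, most similar first, so top_15[0][0]
# indexes the best-matching training question and line[top_15[0][0]] is its
# paired answer.  A hypothetical call (paths are placeholders, not from the
# original code):
# run_prediction("./TFIDF_baseline/dataSet/devQuestions.txt",
#                "./TFIDF_baseline/dataSet/result.txt")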
# encoding=utf-8
from Segment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
from time import clock

if __name__ == '__main__':
    start = clock()
    # Read the second half of the corpus
    file_obj = FileObj(r"sentence2.txt")
    train_sentences = file_obj.read_lines()
    # Read the first half of the corpus
    file_obj = FileObj(r"sentence1.txt")
    test1_sentences = file_obj.read_lines()
    # Segmentation tool: a wrapper around jieba, mainly to remove stop words
    seg = Seg()
    # Build the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()  # tfidf model
    ss.LsiModel()     # lsi model
    #ss.LdaModel()    # lda model
    # Compute sentence similarities
    # for i in range(0, len(train_sentences) / 100):
    #     mysims = ss.mysimilarity(test1_sentences[i * 100])
#encoding=utf-8
from fileObject import FileObj, Seg
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()
    # Read test set 1
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()
    # Read test set 2
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()
    # Segmentation tool: a thin wrapper I added around jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    # Test set 1
    right_count = 0
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()
    # Read test set 1
    file_obj = FileObj(r"testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()
    # Read test set 2
    file_obj = FileObj(r"testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()
    # Segmentation tool: a thin wrapper I added around jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    # Test set 1
#encoding=utf-8
from cutWords import Seg
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
import time
from time import ctime
import threading

file_obj = FileObj(r"dataSet/train_q.txt")
train_sentences = file_obj.read_lines()
with open("dataSet/train_a.txt", 'r', encoding='utf-8') as file_answer:
    line = file_answer.readlines()

seg = Seg()
# Train the models
ss1 = SentenceSimilarity(seg)
ss1.set_sentences(train_sentences)
ss1.TfidfModel()  # tfidf model

ss2 = SentenceSimilarity(seg)
ss2.set_sentences(train_sentences)
ss2.LsiModel()    # LSI model

def tfidf_model(sentence):
    top = ss1.similarity(sentence)
    answer_index = top[0][0]
    answer = line[answer_index]
    return top[0][1], answer
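# ss2 (the LSI index) is built above but not used in this excerpt; a companion
# lookup presumably mirrors tfidf_model.  A hypothetical sketch, assuming
# ss2.similarity() returns the same (index, score) ranking as ss1.similarity():
def lsi_model(sentence):
    top = ss2.similarity(sentence)   # ranked (train index, score) pairs
    answer_index = top[0][0]         # index of the best-matching training question
    answer = line[answer_index]      # its paired answer from train_a.txt
    return top[0][1], answer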
            if it[0] not in uselessTag:
                if not useStopWord:
                    word_list.append(tagdict['word'][index])
                elif tagdict['word'][index] not in self.stopwords:
                    word_list.append(tagdict['word'][index])
        return word_list

    def cut(self, sentences):
        """
        Segment the corpus.
        :param sentences: corpus to be segmented
        :return: list of denoised word lists
        """
        tags = self.get_tags(sentences)
        cutedSentences = []
        for sentence in tags:
            cutedSentences.append(self.denoisingOne(sentence))
        return cutedSentences

    def depenPars(self, sentences):
        return self.nlp.depparser(sentences)


if __name__ == "__main__":
    from fileObject import FileObj
    Fobj = FileObj(r"testSet/trainSet.txt")
    scentences1 = Fobj.read_lines()
    cutTool = CNSegment()
    lst = cutTool.depenPars(scentences1[0])
    print json.dumps(lst, encoding="UTF-8", ensure_ascii=False)
from collections import deque
import random
import string
from zhon.hanzi import punctuation
import math

NO_PUNCTUATION = False  # strip the punctuation at the start and end of each sentence
RANDRANGE = 1
MORE_HAPPY = True
BAD_EMO = ['死', '亡']
WINDOW_SIZE = 2

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"dataSet/train.txt")
    train_sentences = file_obj.read_lines()
    train_sentences_len = len(train_sentences)
    # Read the test set
    file_obj = FileObj(r"dataSet/test_keywords.txt")
    test_sentences = file_obj.read_lines()
    # Segmentation tool based on jieba, with stop-word removal
    seg = Seg()
    # # Train the model
    # ss = SentenceSimilarity(seg)
    # ss.set_sentences(train_sentences)
#encoding=utf-8
from cutWords import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(r"dataSet/devQuestions.txt")
    test_sentences = file_obj.read_lines()
    # Segmentation tool based on jieba, with stop-word removal
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # tfidf model
    # Test set
    right_count = 0
    file_result = open('dataSet/result.txt', 'w')
    with open("dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
        line = file_answer.readlines()
#encoding=utf-8
from Segment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
from time import clock

if __name__ == '__main__':
    start = clock()
    for num in range(1, 81):
        # Read the second half of the corpus
        file_obj = FileObj("sentence2_" + str(num) + ".txt")
        train_sentences = file_obj.read_lines()
        # Read the first half of the corpus
        file_obj = FileObj("sentence1_" + str(num) + ".txt")
        test1_sentences = file_obj.read_lines()
        # Segmentation tool: a wrapper around jieba, mainly to remove stop words
        seg = Seg()
        # Build the model
        ss = SentenceSimilarity(seg)
        ss.set_sentences(train_sentences)
        #ss.TfidfModel()  # tfidf model
        ss.LsiModel()     # lsi model
        #ss.LdaModel()    # lda model
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/result0.txt")
    train_sentences = file_obj.read_lines()
    # Read test set 1
    file_obj = FileObj(r"testSet/test0.txt")
    test1_sentences = file_obj.read_lines()
    # Read test set 2
    #file_obj = FileObj(r"testSet/testSet2.txt")
    #test2_sentences = file_obj.read_lines()
    # Segmentation tool: a thin wrapper I added around jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    #ss.LsiModel()    # lsi model
    #ss.LdaModel()    # lda model
    # Test set 1
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()
    # Read test set 1
    file_obj = FileObj(r"testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()
    # Read test set 2
    file_obj = FileObj(r"testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()
    # Segmentation tool: a thin wrapper I added around jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    # Test set 1
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/data2")
    train_sentences = file_obj.read_lines()
    # Read test set 1
    file_obj = FileObj(r"testSet/testSet3")
    test1_sentences = file_obj.read_lines()
    # Read test set 2
    #file_obj = FileObj(r"testSet/testSet2.txt")
    #test2_sentences = file_obj.read_lines()
    # Segmentation tool: a thin wrapper I added around jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()  # tfidf model
    #ss.LsiModel()    # lsi model
    #ss.LdaModel()    # lda model
    ss.FasttxModel()  # fastText model
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/tjmsnew.txt")
    train_sentences = file_obj.read_lines()
    file_obj = FileObj(r"testSet/zhenduanxx-utf.txt")
    test1_sentences = file_obj.read_lines()
    #test1_sentences = "子宫 肌瘤"
    # Segmentation tool based on jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()  # tfidf model
    #ss.LsiModel()    # lsi model
    ss.LdaModel()     # lda model
    #ss.W2Vmodel()
    for j in range(0, len(test1_sentences)):
        sentence = ss.similarity(test1_sentences[j], j)
    '''
    # Test set 1
    right_count = 0
# import modules & set up logging
import gensim, logging
from fileObject import FileObj
from gensim.models import Word2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if __name__ == '__main__':
    file_obj = FileObj(r"testSet/data")
    sentences = file_obj.read_lines_1_words()
    #model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
    #model.save('w2v_model')
    model = Word2Vec.load('w2v_model')
    print(model.most_similar(['怀孕']))
    print(model.similarity('怀孕', '孕妇'))
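# Note: the commented-out training call uses pre-gensim-4 keyword names.  With
# gensim >= 4.0 the 'size' argument is 'vector_size' and similarity queries live
# on model.wv; an equivalent sketch (same corpus and hyperparameters assumed):
# model = Word2Vec(sentences, sg=1, vector_size=100, window=5, min_count=5,
#                  negative=3, sample=0.001, hs=1, workers=4)
# model.save('w2v_model')
# print(model.wv.most_similar(['怀孕']))
# print(model.wv.similarity('怀孕', '孕妇'))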
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/num-tjms.txt")
    train_sentences = file_obj.read_lines()
    test1_sentences = input("Please enter keywords: ")
    #test1_sentences = "T波 异常"
    # Segmentation tool: a thin wrapper I added around jieba, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    # Test set 1
    right_count = 0
    sentence = ss.similarity(test1_sentences)
    print(sentence.origin_sentence)