def main():
    sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    # print type(train_sentences[0])
    # print len(sentences)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])

    # print "build simi_model"
    SSO = SentenceSimilarlyObj()
    # corpus = SSO.getCorpus(train_sentences)
    # SSO.setSimilar(corpus=corpus)
    # print "save simi model"
    # SSO.save()
    # SSO.save("simi_model_little", "word_dic_little")
    # print "build success"
    print "load model"
    SSO.load()
    # print SSO.similar
    print "test"
    # indexs = SSO.calSentenceSimilarly(sentence=u"说说后天是礼拜几")
    # for index in indexs:
    #     print index[0], train_sentences[index[0]], index[1]
    result = SSO.calSentencesSimilarly(train_sentences, train_sentences)
    Wr = WriteResult()
    can_not_deal = Wr.WriteSimilarlySentence(result, "docSim_simi.txt")
def main():
    sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    # print type(train_sentences[0])
    # print len(sentences)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])

    # print "build simi_model"
    SSO = SentenceSimilarlyObj()
    corpus = SSO.getCorpus(train_sentences)
    SSO.setSimilar(corpus=corpus)
    print "save simi model"
    # SSO.save()
    # SSO.save("simi_model_little", "word_dic_little")
    # print "build success"
    print "load model"
    SSO.load()
    # print SSO.similar
    print "test"
def analysisSimilary_Word(model, file=MyCode.config.CorpusFilePath + "favor0721.xlsx", sheet='节日210'):
    sentences, R_S = ReadFile.getOneSheetContext(file, sheet)
    cla_Key_words = []
    for sentence in sentences:
        # print sentence
        key_sentence = jieba.analyse.extract_tags(sentence)
        for w in key_sentence:
            cla_Key_words.append(w)
    cla_Key_words = list(set(cla_Key_words))
    all_key_words = []
    for word in cla_Key_words:
        try:
            if model.similarity(str("节日"), str(word)) < 0.4:
                continue
            all_key_words.append(word)
            simi_words = model.most_similar(str(unicode(word)))
        except KeyError:
            continue
        for w in simi_words:
            all_key_words.append(w[0])
    all_key_words = list(set(all_key_words))
    all_key_words = extend_Word(model, all_key_words)
    # write to file
    with open(MyCode.config.ResultFilePath + "Topic_festival.txt", 'w') as fp:
        for key_word in all_key_words:
            fp.writelines(key_word + "\n")
def getNewWords(self):
    file = config.WordDicPath + "birds.txt"
    lines = ReadFile.readTXTFile(file)
    words = []
    for line in lines:
        words.extend(line.strip().split(" "))
    return words
def MyTest():
    print "1"
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    # test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    # for sen in test_sentences_doc:
    #     sen_iterms = sen.strip().split("\t")
    #     if len(sen_iterms) >= 2:
    #         test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    test_sentences = train_sentences
    tsf = Ranker()
    tsf.load(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    result = tsf.getSimilarSentences(test_sentences)
    wr = WriteResult()
    wr.WriteSimilarSentence(result, file=config.SimilarlySentencePath + "rank_simi.txt")
def MyTest():
    print "1"
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    # test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    # for sen in test_sentences_doc:
    #     sen_iterms = sen.strip().split("\t")
    #     if len(sen_iterms) >= 2:
    #         test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    test_sentences = train_sentences
    tsf = Ranker()
    tsf.load(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    result = tsf.getSimilarSentences(test_sentences[:100])
    wr = WriteResult()
    wr.WriteSimilarSentence(result, file=config.SimilarlySentencePath + "rank_simi.txt")
def getFileSentence():
    filepath = "../Data/corpus/"
    files = ['favor0721.xlsx', 'inter0721.xlsx', 'Sentence_QR_pair_0714.xlsx']
    sentences = []
    for file in files:
        Q, R = ReadFile.getFileSentence(filepath + file)
        sentences.extend(Q)
    return sentences
def getFileContext_Participle(dirPath=MyCode.config.CorpusFilePath):
    # files = ReadFile.getAllFilesInDir(dirPath)
    # sentences = ReadFile.getAllFilesContext(files, dirPath)
    # for test
    sentences = ReadFile.readTXTFile(dirPath + 'corpus.txt')
    par_sentences = Participle.Participle(sentences[:10])
    par_filter_sentences = filterStopWords.filterStopWords(par_sentences)
    # return wordTostr(par_filter_sentences)
    return par_filter_sentences
def main():
    sentences = ReadFile.readTXTFile(config.TopicFilePath + "QRpair.txt")
    qaQueryPairTopic = QAQueryPairTopic()
    result = qaQueryPairTopic.getgetQAQueriesTopicId(sentences)
    wr = WriteResult()
    wr.WriteTopicRegular(result[0])
    wr.WriteTopic(result[1])
    result = qaQueryPairTopic.getResponsesTopic(sentences)
    wr.WriteResponseWithTopicId(result)
def getFileSentences(subfilename="souhu_fenci"):
    sentences = ReadFile.read_souhu_fenci_file(subfilename=subfilename)
    logging.info("data size :%d" % len(sentences))
    print("data size :%d" % len(sentences))
    par_sentences = []
    for sentence in sentences:
        sentence_words = sentence.split(" ")
        par_sentences.append(sentence_words)
    return par_sentences
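# A minimal sketch (my addition, not part of the original pipeline) showing how
# the token lists returned by getFileSentences() could be fed into gensim's
# Word2Vec, mirroring the Word2Vec call used in buildModel() below. The output
# path "word2vec_souhu.model" under MyCode.config.ModelPath is a hypothetical
# name chosen only for illustration.
def train_word2vec_sketch():
    from gensim.models import word2vec
    par_sentences = getFileSentences("souhu_fenci")
    # each element of par_sentences is already a list of tokens
    model = word2vec.Word2Vec(par_sentences, min_count=5, workers=8)
    model.save(MyCode.config.ModelPath + "word2vec_souhu.model")
    return model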
def ExtracKeyWordFromSentence(sentencefile=MyCode.config.SentenceKeyWordPath + "badcase.xlsx"):
    print "Extract key words from sentence"
    sentences = ReadFile.getFileSentence(sentencefile)[0]
    sentences = list(set(sentences))
    tag_sentence = get_tagByjieba(sentences)
    writeToFile(sentences, tag_sentence)
    sentences_keywords = []
    for i in xrange(len(sentences)):
        sentences_keywords.append((sentences[i], tag_sentence[i]))
    return sentences_keywords
def insertDicItem():
    file = MyCode.config.CaiCaiPath + 'AllQueriesWithID_mid2.txt'
    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    sentences = ReadFile.readTXTFile(file)
    with open(fileEnd, 'a+') as fp:
        for sen in sentences:
            lines = sen.split("\t")
            # drop the last three characters of the metadata column and append a client_id field
            lines[2] = lines[2][:-3] + ', "client_id": "c_00000007"}'
            # print lines[2]
            fp.write(lines[0] + "\t" + lines[1] + '\t' + lines[2] + '\n')
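# The slicing above assumes a fixed tail on the metadata column. Below is a
# sketch of a more defensive variant (my assumption, not the original code)
# that parses the third tab-separated column as JSON and re-serializes it with
# the client_id added. Non-ASCII characters are escaped by json.dumps, which
# keeps the write safe for a plain Python 2 file object.
def insertDicItemJson():
    import json
    file = MyCode.config.CaiCaiPath + 'AllQueriesWithID_mid2.txt'
    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    sentences = ReadFile.readTXTFile(file)
    with open(fileEnd, 'a+') as fp:
        for sen in sentences:
            lines = sen.rstrip('\n').split("\t")
            if len(lines) < 3:
                continue
            meta = json.loads(lines[2])  # assumes the third column is a JSON object
            meta["client_id"] = "c_00000007"
            fp.write(lines[0] + "\t" + lines[1] + "\t" + json.dumps(meta) + "\n")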
def buildWordfromExcel():
    file = "../Data/Sentence_QR_pair_0714.xlsx"
    Q_s, R_s = ReadFile.getFileSentence(file)
    sentences = Participle.Participle(Q_s)
    Q_sentences = []
    for s in sentences:
        # join the segmented tokens of one question with spaces
        sentence = ''
        for word in s:
            sentence += word + ' '
        Q_sentences.append(sentence)
    return Q_sentences
def TagSenetnceTest(self, method=2):
    tag_sentence = Sentence_Tag()
    sentences = ReadFile.getQueriesWithId('AllQueriesWithID')
    re_sentences = tag_sentence.tagSentences(sentences[:100], method=method)
    for sen in re_sentences:
        if method == 1:
            for w in sen:
                print w[0], w[1]
            print
        else:
            print sen
def Topic2Vec_v2():
    """
    Analyse the corpus and convert each sentence into a topic vector.
    :return:
    """
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "topic_data_processed.txt")
    docs = []
    lab = []
    for index, line in enumerate(sentences):
        term = line.strip().split("\t")
        if len(term) != 3:
            continue
        docs.append(term[1])
        lab.append(term[2])
    documents = line_Cut_Word(docs)
    documents = [" ".join(doc) for doc in documents]
    lda.load_word_dic()
    lda.load_LdaModel()
    # lda.build_word_dic(lines(documents))
    # print len(lda.word_dic.keys())
    # lda.buildModel(lines(documents))
    result_lab = []
    topic2vec = []
    x_index, y_index = [], []
    count = 0
    print len(lab)
    for index, doc_lab in enumerate(list(zip(docs, lab))):
        if index % 1000 == 0 and index != 0:
            print doc_lab[0], doc_lab[1]
            # break
        doc = doc_lab[0]
        la = doc_lab[1]
        topics = lda.getQuerySimilarly(doc)
        if topics:
            # print doc, "\t", la
            for topic in topics:
                x_index.append(count)
                y_index.append(topic[0])
                topic2vec.append(topic[1])
            count += 1
            result_lab.append(la)
    print len(x_index), len(y_index), len(topic2vec), len(result_lab), count
    result = [x_index, y_index, topic2vec, result_lab]
    with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
        cPickle.dump(result, fp)
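# A minimal sketch (my addition, not part of the original file) of how the
# pickled [x_index, y_index, topic2vec, result_lab] structure written above
# could be loaded back and turned into a sparse document-by-topic matrix.
# The topic count of 21 mirrors the num_topics used in train_lad() below and
# should be treated as an assumption.
def load_topic2vec_sketch():
    import cPickle
    from scipy.sparse import coo_matrix
    with open(config.BTMData + "topic2vec_2.txt", 'rb') as fp:
        x_index, y_index, topic2vec, result_lab = cPickle.load(fp)
    n_docs = len(result_lab)
    n_topics = 21  # assumed; must match the trained LDA model
    doc_topic = coo_matrix((topic2vec, (x_index, y_index)), shape=(n_docs, n_topics))
    return doc_topic.tocsr(), result_lab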
def TestSentencesStruct(self):
    method = 2
    tag_sentence = Sentence_Tag()
    sentences = ReadFile.getQueriesWithId('AllQueriesWithID')
    sentenceStruct = SentenceStruct()
    tag_sentences = tag_sentence.tagSentences(sentences, method)
    sentences_class_list = sentenceStruct.SimiSentenceStruct(tag_sentences)
    # sh = Show()
    # sh.showSenetenceStructResult(sentences_class_list)
    # wr = WriteResult()
    # wr.WriteSentenceStruct(sentences_class_list)
    return sentences_class_list
def getWords(self):
    file = config.Semantic_dicPath + "semantic_wordgroup_new.txt"
    words = {}
    sentences = ReadFile.readTXTFile(file)
    for sen in sentences:
        items = sen.strip().split(' ')
        if len(items) < 2:
            continue
        if words.has_key(items[1]):
            words[items[1]].append(items[0])
        else:
            words[items[1]] = [items[0]]
    return words
def TestTopic(self):
    Ta = TopicAnalysis()
    sentences = ReadFile.readTXTFile(config.TopicFilePath + "test_topic.txt")
    for text in sentences:
        # text = raw_input('query:\n')
        print "问 :" + text
        sentence = Sentence()
        sentence.text = text.strip()
        response = Ta.getResponse(sentence)
        if response:
            print "答 :" + response
        else:
            print "没有合适转移话题!"
def buildModel(Name="wordRank_filter"):
    file = MyCode.config.ModelPath + Name + '.model'
    model = None
    try:
        model = word2vec.Word2Vec.load(file)
    except:
        # word_sentences = getFileContext_Participle()
        # sentences = wordTostr(word_sentences)
        # sentences = TextIter()
        sentences = ReadFile.readTXTFile(MyCode.config.CorpusFilePath + 'souhu_fenci.txt')
        # the corpus file is already word-segmented; Word2Vec expects lists of
        # tokens, so split each line instead of passing raw strings
        token_lists = [sen.strip().split(" ") for sen in sentences[:10]]
        model = word2vec.Word2Vec(token_lists, min_count=1, workers=8)
        model.save(file)
    return model
def filter(file=config.WordDicPath + "semantic_wordgroup_new.txt"):
    word_sentences = ReadFile.readTXTFile(file)
    word_dic = {}
    for word in word_sentences:
        iterms = word.strip().split(" ")
        if len(iterms) != 2:
            continue
        if iterms[0] in word_dic:
            if iterms[1] not in word_dic[iterms[0]]:
                word_dic[iterms[0]].append(iterms[1])
        else:
            word_dic[iterms[0]] = [iterms[1]]
    return word_dic
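# A small usage sketch (my addition) combining the two views of the semantic
# dictionary built above: filter() returns a word -> group-id mapping and
# getWords() returns a group-id -> words mapping, both assumed to be read from
# the same semantic_wordgroup_new.txt file. The helper looks up all words that
# share at least one semantic group with a given word.
def words_in_same_group(word, word_to_groups, group_to_words):
    related = set()
    for group_id in word_to_groups.get(word, []):
        for w in group_to_words.get(group_id, []):
            if w != word:
                related.add(w)
    return sorted(related)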
def getQRsentences(file=MyCode.config.CaiCaiPath + "0812caicai.xlsx"):
    tables = ReadFile.readExcel(file)
    Q_R_sentences = []
    for table in tables:
        for i in xrange(1, table.nrows):
            lines = table.row_values(i)[0:]
            values = []
            for va in lines:
                values.append(va)
            # print values
            Q_R_sentences.append(values)
    # Q_par(Q_R_sentences)
    print len(Q_R_sentences)
    return Q_R_sentences
def buildWordFromTxt():
    file = "../Result/sentence1.txt"
    sentences = ReadFile.readTXTFile(file)
    par_Sentences = []
    par_Sentences = Participle.Participle(sentences[10000:11000])
    w_Sentence = []
    with open("../Result/fenci.txt", 'w') as fp:
        for s in par_Sentences:
            p_sentence = ''
            for word in s:
                p_sentence += word + ' '
            w_Sentence.append(p_sentence)
        # print 'Start writing ... ...'
        # fp.writelines(w_Sentence)
        # print 'Finished writing !'
    return w_Sentence
def main():
    model = Doc2VecObj()
    model.load()
    sc = SentencesClusters(20, model)
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    sc.getCluster(train_sentences[:100])
def zhwiki_segment(_config, div_size=10, remove_alpha=True):
    files = []
    for i in xrange(div_size):
        files.append(
            codecs.open(os.path.join(
                _config.data_path,
                _config.zhwiki_seg_t2s + str("%02d.txt" % i)), 'w', encoding='utf-8'))
    print('Start...')
    stopWords = ReadFile.readStopWord(_config.stopWordPath + 'stop.txt')
    file_len = 0
    i = 0  # reset the counter before using it to count input lines
    with codecs.open(os.path.join(_config.data_path, _config.zhwiki_raw_t2s), 'r', encoding='utf-8') as raw_input:
        # file_len = raw_input.
        line = raw_input.readline()
        while line:
            line = line.strip()
            i += 1
            if i % 100 == 0:
                print('line ' + str(i))
            # print(line)
            text = line.split()
            if remove_alpha:  # was a hard-coded "if True:"; use the parameter instead
                text = [w for w in text if not is_alpha(w)]
            word_cut_seed = [jieba.cut(t) for t in text]
            tmp = ''
            for sent in word_cut_seed:
                for tok in sent:
                    if tok in stopWords:
                        continue
                    tmp += tok + ' '
            tmp = tmp.strip()
            if tmp:
                try:
                    # distribute lines across the div_size output files (was hard-coded modulo 10)
                    files[i % div_size].write(tmp + '\n')
                except:
                    # keep going on write errors; a "continue" here would skip the
                    # readline below and loop forever on the same line
                    print("file write error!")
            line = raw_input.readline()
    for i in xrange(div_size):
        files[i].close()
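# A minimal sketch (my addition) of how the segmented wiki files written by
# zhwiki_segment() could be streamed into gensim's Word2Vec without loading the
# whole corpus into memory. The file-name pattern mirrors the one used above;
# the output name "zhwiki_word2vec.model" is hypothetical, and the word2vec
# module is assumed to be the gensim import already used elsewhere in this code.
class ZhwikiSegments(object):
    def __init__(self, _config, div_size=10):
        self._config = _config
        self.div_size = div_size

    def __iter__(self):
        # re-opens the files on every pass, as gensim iterates the corpus more than once
        for i in xrange(self.div_size):
            path = os.path.join(self._config.data_path,
                                self._config.zhwiki_seg_t2s + str("%02d.txt" % i))
            with codecs.open(path, 'r', encoding='utf-8') as fp:
                for line in fp:
                    yield line.strip().split(' ')


def train_zhwiki_word2vec_sketch(_config, div_size=10):
    model = word2vec.Word2Vec(ZhwikiSegments(_config, div_size), min_count=5, workers=8)
    model.save(os.path.join(_config.data_path, "zhwiki_word2vec.model"))
    return model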
def main():
    # old version
    # Lda_model = LdaTopic(modelName='LDAsimiWord_5')
    # # print Lda_model[0]
    # lda_model = Lda_model[0]
    # words = Lda_model[1]
    # n_top_words = 1
    # for i, topic_dist in enumerate(lda_model.topic_word_):
    #     topic_words = numpy.array(words)[numpy.argsort(topic_dist)][:n_top_words:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    #
    # Lda_model2 = LdaTopic(modelName='LDAsimiWord_6')
    # # print Lda_model[0]
    # lda_model2 = Lda_model2[0]
    # words2 = Lda_model2[1]
    # n_top_words2 = 1
    # for i, topic_dist in enumerate(lda_model2.topic_word_):
    #     topic_words = numpy.array(words2)[numpy.argsort(topic_dist)][:n_top_words2:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    # documents = getFileSentences()
    # sentences = [u"你知道周杰伦是谁么?", u"周杰伦是谁?", u"你知道周杰伦吗?", u"你认识周杰伦吗?", u"你认识周杰伦么?", u"你知道周杰伦么?",
    #              u"周杰伦知道么?", u"周杰伦知道是谁么?", u"周杰伦知道吗", u"周杰伦知道是谁吗?", u"周杰伦是谁?", u"周杰伦你认识么?", u"周杰伦你知道是谁么?",
    #              u"周杰伦你认识吗?", u"周杰伦你知道是谁吗?", u"你认识周杰伦吗?", u"你知道周杰伦吗?", u"你知道周杰伦么?", u"你认识周杰伦么?", u"你认识周杰伦吗?",
    #              u"你认识周杰伦是谁么?", u"你认识周杰伦是谁吗?", u"你知道周杰伦吗?", u"你知道周杰伦是谁吗?", u"你知道周杰伦是谁么?"]
    sentences = ReadFile.getQueriesSentence(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    queries = getQueries(sentences[:100])
    for query in queries:
        for q in query:
            print q,
        print
    docs_topic = getQueriySimilarly(queries)
    # for topic in docs_topic:
    #     for re in topic:
    #         print re, 1
    #     print
    results = groupByTopic(docs_topic, sentences)
    sh = Show()
    sh.showDocTopicResult(results)
    Wr = WriteResult()
    Wr.WriteTopicResult(results)
def train_lad():
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
    # line = LineSetence(sentences=sentences)
    lda.buildModel(lines(sentences), num_topics=21)
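# The lines() helper passed to lda.buildModel() above is not shown in this
# section. A minimal sketch of what it presumably does, assuming the corpus
# file holds one whitespace-separated, pre-segmented document per line; treat
# this as an illustrative stand-in rather than the project's actual helper.
def lines(sentences):
    for sentence in sentences:
        tokens = sentence.strip().split(" ")
        if tokens:
            yield tokens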
def main():
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    docs = LabelSentences(filename=None, sentences=train_sentences)
    # docs = LabelSentences.LabelSentences(sentences=train_sentences)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    # train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
    # print len(sentences)
    # train_sentences = []
    # for sen in sentences:
    #     sen_iterms = sen.split("\t")
    #     if len(sen_iterms) == 2:
    #         print sen_iterms[1]
    #         train_sentences.append(sen_iterms[1])
    # test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829_t.txt")
    # test_sentences = ['周涛知道是谁吗']
    test_sentences = train_sentences[:100]
    SSO = Doc2VecObj()
    # corpus = SSO.getCorpus(docs)
    # SSO.buildModel(docs)
    # SSO.save()
    # load model
    SSO.load()
    result = SSO.calSentencesSimilarly(test_sentences, train_sentences)
    Wr = WriteResult()
    can_not_deal = Wr.WriteSimilarlySentence(result, "Doc2Vec_simi.txt")
def main():
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    docs = LabelSentences(filename=None, sentences=train_sentences)
    # docs = LabelSentences.LabelSentences(sentences=train_sentences)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    # train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
    # print len(sentences)
    # train_sentences = []
    # for sen in sentences:
    #     sen_iterms = sen.split("\t")
    #     if len(sen_iterms) == 2:
    #         print sen_iterms[1]
    #         train_sentences.append(sen_iterms[1])
    # test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829_t.txt")
    # test_sentences = ['周涛知道是谁吗']
    test_sentences = train_sentences[:100]
    SSO = Doc2VecObj()
    # corpus = SSO.getCorpus(docs)
    # SSO.buildModel(docs)
    # SSO.save()
    print "load model"
    SSO.load()
    value = SSO.similarly(u"早起吃的油条,很好吃。", u"今天吃什么")
    # result = SSO.most_similarSentence(test_sentences[9], test_sentences[:200], topn=10)
    # print test_sentences[9]
    # for re in result:
    #     print re[0], re[1]
    print "similarly : ", value
def getQuestionFile(Q_R_sentences):
    QIdfile = MyCode.config.CaiCaiDataPath + "AllQueriesWithID.txt"
    QnIdfile = MyCode.config.CaiCaiPath + "AllQueriesWithID.txt"
    mapQRfile = MyCode.config.CaiCaiDataPath + "AllQueryResponseIdMap.txt"
    mapnQRfile = MyCode.config.CaiCaiPath + "AllQueryResponseIdMap.txt"
    RIdfile = MyCode.config.CaiCaiDataPath + "AllResponsesWithID.txt"
    RnIdfile = MyCode.config.CaiCaiPath + "AllResponsesWithID.txt"
    Q_sentences = ReadFile.readTXTFile(QIdfile)
    QR_map = []
    OldMapQid = {}
    MapRId = {}
    MapQid = {}
    r_id = 1
    q_id = 1
    OldRIdSentences = ReadFile.readTXTFile(RIdfile)
    R_sens = []
    for line in OldRIdSentences:
        R_sens.append(line.strip().split("\t")[1])
    R_sens = list(set(R_sens))
    for qr_sentence in Q_R_sentences:
        csentence = []
        exist = False
        for sentence in Q_sentences:
            csentence = sentence.strip().split('\t')
            line = csentence[1].replace(' ', '')
            if line == qr_sentence[0]:
                exist = True
                break
        if exist:
            sq_id = csentence[0]
            OldMapQid.setdefault(sq_id, '')
        else:
            sq_id = "CAICAI_Q_" + str(time.strftime("%Y%m%d%H%M", time.localtime())) + "%05d" % q_id
            q_id += 1
            MapQid.setdefault(sq_id, qr_sentence[1].replace(' ', ''))
        for i in xrange(2, 5):
            if qr_sentence[i] in R_sens:
                continue
            if qr_sentence[i] != '' and len(qr_sentence[i]) > 2:
                print qr_sentence[i]
                sr_id = 'CAICAI_R_' + str(time.strftime("%Y%m%d%H%M", time.localtime())) + '%05d' % r_id
                QR_map.append((sq_id, sr_id))
                MapRId.setdefault(sr_id, qr_sentence[i])
                r_id += 1
    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    # rewrite the Questions file
    with open(fileEnd, 'w') as fp:
        # print len(OldMapQid.keys())
        for sen in Q_sentences:
            lines = sen.split('\t')[0]
            if OldMapQid.has_key(lines):
                fp.write(sen[:-2] + ',"client_id": "c_00000007"}\n')
            else:
                fp.write(sen)
    # write the results to file
    with open(QnIdfile, 'w') as fp:
        MapQid = sorted(MapQid.iteritems(), key=lambda asd: asd[0], reverse=False)
        for id in MapQid:
            fp.write(id[0] + '\t' + id[1] + "\n")
    with open(mapnQRfile, 'w') as fp:
        sen = ReadFile.readTXTFile(mapQRfile)
        for s in sen:
            lines = s.split('\t')
            print lines
            QR_map.append((lines[0], lines[1][:-1]))
        QR_map = list(set(QR_map))
        for qr in sorted(QR_map, key=lambda asd: asd[0], reverse=False):
            fp.write(qr[0] + '\t' + qr[1] + '\n')
    with open(RnIdfile, 'w') as fp:
        MapRId = sorted(MapRId.iteritems(), key=lambda asd: asd[0], reverse=False)
        for id in MapRId:
            fp.write(id[0] + '\t' + id[1].strip() + '\t{"client_id": "c_00000007"}\n')