def main():
    sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    # print type(train_sentences[0])
    # print len(sentences)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])

    # print "build simi_model"
    SSO = SentenceSimilarlyObj()
    # corpus = SSO.getCorpus(train_sentences)
    # SSO.setSimilar(corpus=corpus)
    # print "save simi model"
    # SSO.save()
    # SSO.save("simi_model_little", "word_dic_little")
    # print "build success"
    print "load model"
    SSO.load()
    # print SSO.similar
    print "test"
    # indexs = SSO.calSentenceSimilarly(sentence=u"说说后天是礼拜几")
    # for index in indexs:
    #     print index[0], train_sentences[index[0]], index[1]
    result = SSO.calSentencesSimilarly(train_sentences, train_sentences)
    Wr = WriteResult()
    can_not_deal = Wr.WriteSimilarlySentence(result, "docSim_simi.txt")
def main():
    sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    # print type(train_sentences[0])
    # print len(sentences)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])

    # print "build simi_model"
    SSO = SentenceSimilarlyObj()
    corpus = SSO.getCorpus(train_sentences)
    SSO.setSimilar(corpus=corpus)
    print "save simi model"
    # SSO.save()
    # SSO.save("simi_model_little", "word_dic_little")
    # print "build success"
    print "load model"
    SSO.load()
    # print SSO.similar
    print "test"
def getNewWords(self):
    file = config.WordDicPath + "birds.txt"
    lines = ReadFile.readTXTFile(file)
    words = []
    for line in lines:
        words.extend(line.strip().split(" "))
    return words
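# Example (not part of the original code): a minimal sketch of registering the words
# returned by getNewWords with a segmenter's user dictionary. jieba is only an
# assumption here; the original code does not show which segmenter backs the
# Participle module, and demo_register_new_words is a hypothetical helper name.
def demo_register_new_words(obj):
    import jieba
    for word in obj.getNewWords():
        # make sure multi-character entries such as bird names are kept as one token
        jieba.add_word(word)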
def MyTest():
    print "1"
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    # test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    # for sen in test_sentences_doc:
    #     sen_iterms = sen.strip().split("\t")
    #     if len(sen_iterms) >= 2:
    #         test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    test_sentences = train_sentences
    tsf = Ranker()
    tsf.load(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    result = tsf.getSimilarSentences(test_sentences[:100])
    wr = WriteResult()
    wr.WriteSimilarSentence(result, file=config.SimilarlySentencePath + "rank_simi.txt")
def MyTest():
    print "1"
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    # test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    # for sen in test_sentences_doc:
    #     sen_iterms = sen.strip().split("\t")
    #     if len(sen_iterms) >= 2:
    #         test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    test_sentences = train_sentences
    tsf = Ranker()
    tsf.load(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    result = tsf.getSimilarSentences(test_sentences)
    wr = WriteResult()
    wr.WriteSimilarSentence(result, file=config.SimilarlySentencePath + "rank_simi.txt")
def getFileContext_Participle(dirPath=MyCode.config.CorpusFilePath):
    # files = ReadFile.getAllFilesInDir(dirPath)
    # sentences = ReadFile.getAllFilesContext(files, dirPath)
    # for test
    sentences = ReadFile.readTXTFile(dirPath + 'corpus.txt')
    par_sentences = Participle.Participle(sentences[:10])
    par_filter_sentences = filterStopWords.filterStopWords(par_sentences)
    # return wordTostr(par_filter_sentences)
    return par_filter_sentences
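# Example (not part of the original code): a minimal sketch of feeding the tokenized,
# stop-word-filtered output of getFileContext_Participle into a gensim bag-of-words
# corpus. It assumes the return value is a list of token lists; the helper name
# demo_build_bow is hypothetical, while corpora.Dictionary and doc2bow are gensim's API.
def demo_build_bow():
    from gensim import corpora
    par_filter_sentences = getFileContext_Participle()
    dictionary = corpora.Dictionary(par_filter_sentences)  # token -> integer id
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in par_filter_sentences]
    return dictionary, bow_corpus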
def main():
    sentences = ReadFile.readTXTFile(config.TopicFilePath + "QRpair.txt")
    qaQueryPairTopic = QAQueryPairTopic()
    result = qaQueryPairTopic.getgetQAQueriesTopicId(sentences)
    wr = WriteResult()
    wr.WriteTopicRegular(result[0])
    wr.WriteTopic(result[1])
    result = qaQueryPairTopic.getResponsesTopic(sentences)
    wr.WriteResponseWithTopicId(result)
def insertDicItem():
    file = MyCode.config.CaiCaiPath + 'AllQueriesWithID_mid2.txt'
    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    sentences = ReadFile.readTXTFile(file)
    with open(fileEnd, 'a+') as fp:
        for sen in sentences:
            lines = sen.split("\t")
            lines[2] = lines[2][:-3] + ', "client_id": "c_00000007"}'
            # print lines[2]
            fp.write(lines[0] + "\t" + lines[1] + '\t' + lines[2] + '\n')
def Topic2Vec_v2():
    """
    Analyze the sentences and convert each sentence into a topic vector.
    :return:
    """
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "topic_data_processed.txt")
    docs = []
    lab = []
    for index, line in enumerate(sentences):
        term = line.strip().split("\t")
        if len(term) != 3:
            continue
        docs.append(term[1])
        lab.append(term[2])
    documents = line_Cut_Word(docs)
    documents = [" ".join(doc) for doc in documents]
    lda.load_word_dic()
    lda.load_LdaModel()
    # lda.build_word_dic(lines(documents))
    # print len(lda.word_dic.keys())
    # lda.buildModel(lines(documents))
    result_lab = []
    topic2vec = []
    x_index, y_index = [], []
    count = 0
    print len(lab)
    for index, doc_lab in enumerate(list(zip(docs, lab))):
        if index % 1000 == 0 and index != 0:
            print doc_lab[0], doc_lab[1]
            # break
        doc = doc_lab[0]
        la = doc_lab[1]
        topics = lda.getQuerySimilarly(doc)
        if topics:
            # print doc, "\t", la
            for topic in topics:
                x_index.append(count)
                y_index.append(topic[0])
                topic2vec.append(topic[1])
            count += 1
            result_lab.append(la)
    print len(x_index), len(y_index), len(topic2vec), len(result_lab), count
    result = [x_index, y_index, topic2vec, result_lab]
    with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
        cPickle.dump(result, fp)
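# Example (not part of the original code): a minimal sketch of reading back the pickled
# [x_index, y_index, topic2vec, result_lab] lists written by Topic2Vec_v2 and assembling
# them into a sparse document-topic matrix. The helper name demo_load_topic2vec and the
# use of scipy.sparse are assumptions; the original only dumps the raw coordinate lists.
def demo_load_topic2vec(path=config.BTMData + "topic2vec_2.txt"):
    import cPickle
    from scipy.sparse import coo_matrix
    with open(path, 'rb') as fp:
        x_index, y_index, topic2vec, result_lab = cPickle.load(fp)
    # rows are documents, columns are topic ids, values are topic weights
    doc_topic = coo_matrix((topic2vec, (x_index, y_index)))
    return doc_topic, result_lab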
def TestTopic(self):
    Ta = TopicAnalysis()
    sentences = ReadFile.readTXTFile(config.TopicFilePath + "test_topic.txt")
    for text in sentences:
        # text = raw_input('query:\n')
        print "问 :" + text
        sentence = Sentence()
        sentence.text = text.strip()
        response = Ta.getResponse(sentence)
        if response:
            print "答 :" + response
        else:
            print "没有合适转移话题!"
def buildModel(Name="wordRank_filter"):
    file = MyCode.config.ModelPath + Name + '.model'
    model = None
    try:
        model = word2vec.Word2Vec.load(file)
    except:
        # word_sentences = getFileContext_Participle()
        # sentences = wordTostr(word_sentences)
        # sentences = TextIter()
        sentences = ReadFile.readTXTFile(MyCode.config.CorpusFilePath + 'souhu_fenci.txt')
        # The corpus file is already word-segmented, so split each line into a token
        # list: Word2Vec expects an iterable of token lists, not raw strings.
        model = word2vec.Word2Vec([sen.strip().split() for sen in sentences[:10]],
                                  min_count=1, workers=8)
        model.save(file)
    return model
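# Example (not part of the original code): a minimal sketch of querying the word2vec
# model returned by buildModel. most_similar is gensim's standard API; the query word
# and the helper name demo_query_model are illustrative assumptions, and a model trained
# on such a small slice of the corpus may simply lack the word in its vocabulary.
def demo_query_model():
    model = buildModel()
    try:
        for word, score in model.most_similar(u"新闻", topn=5):
            print word, score
    except KeyError:
        print "query word not in vocabulary"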
def getWords(self):
    file = config.Semantic_dicPath + "semantic_wordgroup_new.txt"
    words = {}
    sentences = ReadFile.readTXTFile(file)
    for sen in sentences:
        items = sen.strip().split(' ')
        if len(items) < 2:
            continue
        if words.has_key(items[1]):
            words[items[1]].append(items[0])
        else:
            words[items[1]] = [items[0]]
    return words
def filter(file=config.WordDicPath + "semantic_wordgroup_new.txt"):
    word_sentences = ReadFile.readTXTFile(file)
    word_dic = {}
    for word in word_sentences:
        iterms = word.strip().split(" ")
        if len(iterms) != 2:
            continue
        if iterms[0] in word_dic:
            if iterms[1] not in word_dic[iterms[0]]:
                word_dic[iterms[0]].append(iterms[1])
        else:
            word_dic[iterms[0]] = [iterms[1]]
    return word_dic
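# Example (not part of the original code): a small sketch of inspecting the word-to-groups
# dictionary returned by filter(), e.g. listing words that were assigned to more than one
# semantic group. The helper name demo_show_ambiguous_words is an assumption.
def demo_show_ambiguous_words():
    word_dic = filter()
    for word, groups in word_dic.iteritems():
        if len(groups) > 1:
            print word, " ".join(groups)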
def buildWordFromTxt():
    file = "../Result/sentence1.txt"
    sentences = ReadFile.readTXTFile(file)
    par_Sentences = []
    par_Sentences = Participle.Participle(sentences[10000:11000])
    w_Sentence = []
    with open("../Result/fenci.txt", 'w') as fp:
        for s in par_Sentences:
            p_sentence = ''
            for word in s:
                p_sentence += word + ' '
            w_Sentence.append(p_sentence)
        # print 'Start writing ... ...'
        # fp.writelines(w_Sentence)
        # print 'Finished writing !'
    return w_Sentence
def main():
    model = Doc2VecObj()
    model.load()
    sc = SentencesClusters(20, model)
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    sc.getCluster(train_sentences[:100])
def main():
    # Old version
    # Lda_model = LdaTopic(modelName='LDAsimiWord_5')
    # # print Lda_model[0]
    # lda_model = Lda_model[0]
    # words = Lda_model[1]
    # n_top_words = 1
    # for i, topic_dist in enumerate(lda_model.topic_word_):
    #     topic_words = numpy.array(words)[numpy.argsort(topic_dist)][:n_top_words:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    #
    # Lda_model2 = LdaTopic(modelName='LDAsimiWord_6')
    # # print Lda_model[0]
    # lda_model2 = Lda_model2[0]
    # words2 = Lda_model2[1]
    # n_top_words2 = 1
    # for i, topic_dist in enumerate(lda_model2.topic_word_):
    #     topic_words = numpy.array(words2)[numpy.argsort(topic_dist)][:n_top_words2:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # documents = getFileSentences()
    # sentences = [u"你知道周杰伦是谁么?", u"周杰伦是谁?", u"你知道周杰伦吗?", u"你认识周杰伦吗?", u"你认识周杰伦么?", u"你知道周杰伦么?",
    #              u"周杰伦知道么?", u"周杰伦知道是谁么?", u"周杰伦知道吗", u"周杰伦知道是谁吗?", u"周杰伦是谁?", u"周杰伦你认识么?", u"周杰伦你知道是谁么?",
    #              u"周杰伦你认识吗?", u"周杰伦你知道是谁吗?", u"你认识周杰伦吗?", u"你知道周杰伦吗?", u"你知道周杰伦么?", u"你认识周杰伦么?", u"你认识周杰伦吗?",
    #              u"你认识周杰伦是谁么?", u"你认识周杰伦是谁吗?", u"你知道周杰伦吗?", u"你知道周杰伦是谁吗?", u"你知道周杰伦是谁么?"]
    sentences = ReadFile.readTXTFile("./BitermTopicModel/document_corpus.txt")
    queries = getQueries(sentences[:100])
    for query in queries:
        for q in query:
            print q,
        print
    docs_topic = getQueriySimilarly(queries)
    # for topic in docs_topic:
    #     for re in topic:
    #         print re, 1
    #     print
    results = groupByTopic(docs_topic, sentences)
    sh = Show()
    sh.showDocTopicResult(results)
    Wr = WriteResult()
    Wr.WriteTopicResult(results)
def main():
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    docs = LabelSentences(filename=None, sentences=train_sentences)
    # docs = LabelSentences.LabelSentences(sentences=train_sentences)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    # train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
    # print len(sentences)
    # train_sentences = []
    # for sen in sentences:
    #     sen_iterms = sen.split("\t")
    #     if len(sen_iterms) == 2:
    #         print sen_iterms[1]
    #         train_sentences.append(sen_iterms[1])
    # test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829_t.txt")
    # test_sentences = ['周涛知道是谁吗']
    test_sentences = train_sentences[:100]
    SSO = Doc2VecObj()
    # corpus = SSO.getCorpus(docs)
    # SSO.buildModel(docs)
    # SSO.save()
    # load model
    SSO.load()
    result = SSO.calSentencesSimilarly(test_sentences, train_sentences)
    Wr = WriteResult()
    can_not_deal = Wr.WriteSimilarlySentence(result, "Doc2Vec_simi.txt")
def train_lad():
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
    # line = LineSetence(sentences=sentences)
    lda.buildModel(lines(sentences), num_topics=21)
def getQuestionFile(Q_R_sentences):
    QIdfile = MyCode.config.CaiCaiDataPath + "AllQueriesWithID.txt"
    QnIdfile = MyCode.config.CaiCaiPath + "AllQueriesWithID.txt"
    mapQRfile = MyCode.config.CaiCaiDataPath + "AllQueryResponseIdMap.txt"
    mapnQRfile = MyCode.config.CaiCaiPath + "AllQueryResponseIdMap.txt"
    RIdfile = MyCode.config.CaiCaiDataPath + "AllResponsesWithID.txt"
    RnIdfile = MyCode.config.CaiCaiPath + "AllResponsesWithID.txt"
    Q_sentences = ReadFile.readTXTFile(QIdfile)
    QR_map = []
    OldMapQid = {}
    MapRId = {}
    MapQid = {}
    r_id = 1
    q_id = 1
    OldRIdSentences = ReadFile.readTXTFile(RIdfile)
    R_sens = []
    for line in OldRIdSentences:
        R_sens.append(line.strip().split("\t")[1])
    R_sens = list(set(R_sens))
    for qr_sentence in Q_R_sentences:
        csentence = []
        exist = False
        for sentence in Q_sentences:
            csentence = sentence.strip().split('\t')
            line = csentence[1].replace(' ', '')
            if line == qr_sentence[0]:
                exist = True
                break
        if exist:
            sq_id = csentence[0]
            OldMapQid.setdefault(sq_id, '')
        else:
            sq_id = "CAICAI_Q_" + str(time.strftime("%Y%m%d%H%M", time.localtime())) + "%05d" % q_id
            q_id += 1
            MapQid.setdefault(sq_id, qr_sentence[1].replace(' ', ''))
        for i in xrange(2, 5):
            if qr_sentence[i] in R_sens:
                continue
            if qr_sentence[i] != '' and len(qr_sentence[i]) > 2:
                print qr_sentence[i]
                sr_id = 'CAICAI_R_' + str(time.strftime("%Y%m%d%H%M", time.localtime())) + '%05d' % r_id
                QR_map.append((sq_id, sr_id))
                MapRId.setdefault(sr_id, qr_sentence[i])
                r_id += 1
    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    # Rewrite the Questions file
    with open(fileEnd, 'w') as fp:
        # print len(OldMapQid.keys())
        for sen in Q_sentences:
            lines = sen.split('\t')[0]
            if OldMapQid.has_key(lines):
                fp.write(sen[:-2] + ',"client_id": "c_00000007"}\n')
            else:
                fp.write(sen)
    # Write the results to file
    with open(QnIdfile, 'w') as fp:
        MapQid = sorted(MapQid.iteritems(), key=lambda asd: asd[0], reverse=False)
        for id in MapQid:
            fp.write(id[0] + '\t' + id[1] + "\n")
    with open(mapnQRfile, 'w') as fp:
        sen = ReadFile.readTXTFile(mapQRfile)
        for s in sen:
            lines = s.split('\t')
            print lines
            QR_map.append((lines[0], lines[1][:-1]))
        QR_map = list(set(QR_map))
        for qr in sorted(QR_map, key=lambda asd: asd[0], reverse=False):
            fp.write(qr[0] + '\t' + qr[1] + '\n')
    with open(RnIdfile, 'w') as fp:
        MapRId = sorted(MapRId.iteritems(), key=lambda asd: asd[0], reverse=False)
        for id in MapRId:
            fp.write(id[0] + '\t' + id[1].strip() + '\t{"client_id": "c_00000007"}\n')
def main():
    # train_sentences = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    #     '8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    #     '9一方未签字办理的结婚登记是否有效',
    #     '10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    #     '11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    #     '12身份证被别人冒用无法登记结婚怎么办?',
    #     '13同居后又与他人登记结婚是否构成重婚罪',
    #     '14未办登记只举办结婚仪式可起诉离婚吗',
    #     '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    docs = LabelSentences(filename=None, sentences=train_sentences)
    # docs = LabelSentences.LabelSentences(sentences=train_sentences)
    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    # train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
    # print len(sentences)
    # train_sentences = []
    # for sen in sentences:
    #     sen_iterms = sen.split("\t")
    #     if len(sen_iterms) == 2:
    #         print sen_iterms[1]
    #         train_sentences.append(sen_iterms[1])
    # test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829_t.txt")
    # test_sentences = ['周涛知道是谁吗']
    test_sentences = train_sentences[:100]
    SSO = Doc2VecObj()
    # corpus = SSO.getCorpus(docs)
    # SSO.buildModel(docs)
    # SSO.save()
    print "load model"
    SSO.load()
    value = SSO.similarly(u"早起吃的油条,很好吃。", u"今天吃什么")
    # result = SSO.most_similarSentence(test_sentences[9], test_sentences[:200], topn=10)
    # print test_sentences[9]
    # for re in result:
    #     print re[0], re[1]
    print "similarly : ", value