def notes_classify_lsi(self, notes_path):
    """Classify each note in *notes_path* via the trained LSI model.

    Every line of the file is a JSON record with a 'remarks' field.
    Each remark is segmented, converted to a bag-of-words vector,
    weighted with the TF-IDF model and projected into LSI space.

    :param notes_path: path to a file of JSON lines (read via loadLine).
    :return: list of {'remarks': <text>, 'position': <first LSI entry>}
             dicts; records whose LSI projection is empty are skipped.
    """
    note_list = loadLine(notes_path)
    if not note_list:
        # Guard: the debug print below indexes note_list[0] and would
        # raise IndexError on an empty input file.
        return []
    print('note_item', note_list[0], len(note_list))
    result_list = []
    for note_item in note_list:
        in_json = json.loads(str(note_item))  # decode one JSON record
        remark = in_json['remarks']
        # 1. word segmentation
        note_corpus = self.parser.segmentor.segment(remark)
        # 2. convert tokens to a bag-of-words vector
        note_doc2bow = self.dictionary.doc2bow(note_corpus)
        # 3. weight the vector with the trained TF-IDF model
        note_tfidf = self.tfidf_model[note_doc2bow]
        # (self.lsi.add_documents(note_tfidf) would instead fold the note
        #  into the model - deliberately not done here)
        # 4. project into LSI space
        note_lsi = self.lsi[note_tfidf]
        if note_lsi:
            # note_lsi holds (index_of_document, similarity) tuples;
            # keep only the top entry.
            print('111111 type inside ', type(note_lsi[0]), note_lsi)
            result_list.append({
                'remarks': remark,
                'position': note_lsi[0]
            })
    return result_list
def notes_classify_tfidf(self, notes_path):
    """Classify each note in *notes_path* by TF-IDF similarity search.

    Every line of the file is a JSON record with a 'remarks' field.
    Each remark is segmented, converted to a bag-of-words vector,
    weighted with the TF-IDF model and queried against the trained
    similarity index (top-5 matches requested via num_best).

    :param notes_path: path to a file of JSON lines (read via loadLine).
    :return: list of {'remarks': <text>, 'position': <best match>} dicts;
             records with no similarity hits are skipped.
    """
    note_list = loadLine(notes_path)
    self.similarity.num_best = 5  # ask the index for the 5 best matches
    if not note_list:
        # Guard: the debug print below indexes note_list[0] and would
        # raise IndexError on an empty input file.
        return []
    print('note_item', note_list[0], len(note_list))
    result_list = []
    for note_item in note_list:
        in_json = json.loads(str(note_item))  # decode one JSON record
        remark = in_json['remarks']
        # 1. word segmentation
        note_corpus = self.parser.segmentor.segment(remark)
        # 2. build the note's bag-of-words vector
        note_doc2bow = self.dictionary.doc2bow(note_corpus)
        # 3. weight the query with the trained TF-IDF model before the
        #    similarity lookup
        note_tfidf = self.tfidf_model[note_doc2bow]
        # 4. query the similarity index
        note_similarity = self.similarity[note_tfidf]
        if note_similarity:
            # note_similarity holds (index_of_document, similarity)
            # tuples; keep only the best match.
            print('111111 type inside ', type(note_similarity[0]), note_similarity)
            result_list.append({
                'remarks': remark,
                'position': note_similarity[0]
            })
    return result_list
def remove_duplicates(infile, inname, outfile):
    """Append the unique lines of *infile* to *outfile*, tagged with a label.

    The label is the first '-'-separated token of *inname*'s basename and
    is appended tab-separated to every line.  Only the first occurrence of
    each line is written (order preserved).

    :param infile: input file path, read line-by-line via loadLine.
    :param inname: path-like string whose basename encodes the label.
    :param outfile: output file path, opened in append mode.
    """
    infile_list = loadLine(infile)
    names = inname.split('/')[-1].split("-")
    print('names', names)
    seen = set()  # O(1) membership test instead of the O(n) list scan
    # 'with' guarantees the handle is closed even if a write fails.
    with open(outfile, 'a', encoding='utf-8') as outopen:
        for line in infile_list:
            if line not in seen:
                seen.add(line)
                outopen.write(line.strip() + '\t' + names[0] + '\n')
def word_segmentation(self, input_path, out_path):
    """Segment the 'remarks' field of every JSON line in *input_path*.

    Writes one line per note to *out_path*, tokens joined by ', '.

    :param input_path: path to a file of JSON lines (read via loadLine).
    :param out_path: output file path, truncated and rewritten.
    """
    sentence_list = loadLine(input_path)
    # 'with' guarantees the output file is flushed and closed even if
    # JSON decoding or segmentation raises mid-loop.
    with open(out_path, 'w', encoding='utf-8') as out_open:
        for sentence_item in sentence_list:
            in_json = json.loads(sentence_item)  # decode one JSON record
            remark = in_json['remarks']
            print('sentence_item', remark)
            tokens = self.parser.segmentor.segment(remark)
            out_open.write(', '.join(tokens) + '\n')
def save_no_index_tag():
    """Extract the head-word (HED dependency) info of every remark and dump
    it, together with the record's custom_state, to a marker-separated file.

    NOTE(review): input/output paths are hard-coded; the nesting below was
    reconstructed from a flattened source (in particular the position of
    the fp.write inside the 'HED' branch) -- confirm against the original.
    """
    # Each line of foo.txt is a JSON record with 'remarks' and 'custom_state'.
    list_remark = loadLine('../../res/foo.txt')
    fp = open('../../assert/triple-extractor.txt', "w", encoding='utf-8', errors='ignore')
    for item_data in list_remark:
        print('item_data', item_data)
        in_json = json.loads(item_data)  # decode one JSON record
        # Fall back to placeholder values when the fields are empty/falsy.
        remarks = in_json['remarks'] if in_json['remarks'] else " "
        custom_state = in_json['custom_state'] if in_json['custom_state'] else "\t\t"
        # Note: remarks is at least " " after the fallback above, so this
        # condition is always true as written.
        if remarks != '':
            sentences = split_sents(remarks)
            temp_content = []
            for sentence in sentences:
                # Full dependency parse of one sentence (project parser).
                words, postags, child_dict_list, roles_dict, format_parse_list, parse_child_dict = parse.parser_main(
                    sentence)
                for temp_item in format_parse_list:
                    if 'HED' in temp_item[0]:
                        # Collect the children of the sentence head; only
                        # the first collected entry (temp_content[0]) is
                        # ever written below.
                        temp_content.append(parse_child_dict[temp_item[2]])
                        fp.write(custom_state + '-----' + str(words) +'++++++++'+temp_item[1]+'\t'+str(temp_content[0])+ '\n')
        else:
            fp.write(custom_state + '\t' + '' + '\n\n\n')
    fp.close()
def word_property(self, file_path, save_path_property):
    """POS-tag every remark in *file_path* and bucket words by tag.

    Each word is appended (tab-separated) to a per-tag file whose name is
    *save_path_property* + the word's POS tag.

    :param file_path: path to a file of JSON lines with a 'remarks' field.
    :param save_path_property: path prefix for the per-tag output files.
    """
    # Renamed from the original's 'result_list', which was confusingly
    # rebound inside the loop that iterates it.
    content_list = loadLine(file_path)
    for content_item in content_list:
        in_json = json.loads(str(content_item))  # decode one JSON record
        remark = in_json['remarks']
        # word segmentation, then part-of-speech tagging of the tokens
        words = self.parser.segmentor.segment(remark)
        postags = self.parser.postagger.postag(words)
        # zip pairs each word with its tag; 'with' guarantees each
        # append-mode handle is closed (the original leaked on error).
        for word, tag in zip(words, postags):
            with open(save_path_property + tag, "a", encoding='utf-8',
                      errors='ignore') as fp:
                fp.write(word + '\t')
for token in layer: print token, def __printTree(self, node, layer, layers): layer += 1 if (layer not in layers and node.hasChildren()): layerTitle = "layer", str(layer) + ":\n" layers[layer] = layerTitle for child in node.getChildren(): child = node.get(child) printedLayer = ' ', child.getChar(), '=>', [ child.get(grandchild).getChar() for grandchild in child.getChildren() ], '\n' layers[layer] += printedLayer self.__printTree(child, layer, layers) if __name__ == '__main__': trie = Trie() list_remark = loadLine('../../res/product_name.txt') for item_data in list_remark: trie.add(item_data) print(trie.wordsWithPrefix('臻享')) print(trie.wordsWithPrefix('长安')) print(trie.wordsWithPrefix('财富'))