Example #1
    def notes_classify_lsi(self, notes_path):
        note_list = loadLine(notes_path)
        print('note_item', note_list[0], len(note_list))
        result_list = []
        for note_item in note_list:
            in_json = json.loads(str(note_item))  # Decode the JSON record
            # print('note_item', in_json['remarks'])
            remark = in_json['remarks']
            # 1. Word segmentation
            note_corpus = self.parser.segmentor.segment(remark)
            # 2. Convert to a bag-of-words vector
            note_doc2bow = self.dictionary.doc2bow(note_corpus)
            # 3. Compute the TF-IDF values
            note_tfidf = self.tfidf_model[note_doc2bow]
            # Incrementally update the LSI model (disabled)
            # self.lsi.add_documents(note_tfidf)
            # 4. Project into the LSI space
            note_lsi = self.lsi[note_tfidf]
            if note_lsi:
                # The most similar samples come back as (index_of_document, similarity) tuples
                print('note_lsi type:', type(note_lsi[0]), note_lsi)
                result_list.append({
                    'remarks': remark,
                    'position': note_lsi[0]
                })
        return result_list
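
Every example on this page calls `loadLine`, whose definition is not shown. A minimal sketch of what it presumably does, assuming it simply reads a UTF-8 text file and returns its non-empty lines:

def loadLine(file_path):
    # Assumed helper (not part of these examples): read a UTF-8 text file
    # and return its non-empty lines as a list of strings.
    with open(file_path, 'r', encoding='utf-8') as fp:
        return [line.strip() for line in fp if line.strip()]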
Example #2
    def notes_classify_tfidf(self, notes_path):
        note_list = loadLine(notes_path)
        self.similarity.num_best = 5
        print('note_item', note_list[0], len(note_list))
        result_list = []
        for note_item in note_list:
            in_json = json.loads(str(note_item))  # Decode the JSON record
            # print('note_item', in_json['remarks'])
            remark = in_json['remarks']
            note_corpus = self.parser.segmentor.segment(remark)
            # Build the note's bag-of-words vector
            note_doc2bow = self.dictionary.doc2bow(note_corpus)
            # Compute the query's TF-IDF values with the previously trained
            # model, then run the similarity lookup
            note_tfidf = self.tfidf_model[note_doc2bow]
            # Fetch the similarity results
            note_similarity = self.similarity[note_tfidf]

            if note_similarity:
                # The most similar samples come back as (index_of_document, similarity) tuples
                print('note_similarity type:', type(note_similarity[0]),
                      note_similarity)
                result_list.append({
                    'remarks': remark,
                    'position': note_similarity[0]
                })
        return result_list
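
Both classifiers assume `self.dictionary`, `self.tfidf_model`, `self.lsi`, and `self.similarity` were trained beforehand; the attribute names and the `num_best` / `(doc_index, similarity)` behavior match gensim's API. A minimal training sketch under that assumption, with a toy corpus standing in for the real notes:

from gensim import corpora, models, similarities

# Toy pre-segmented corpus standing in for the real training notes.
texts = [['保险', '理赔', '咨询'],
         ['车险', '报价'],
         ['理赔', '进度', '查询']]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf_model = models.TfidfModel(corpus)            # TF-IDF weighting
lsi = models.LsiModel(tfidf_model[corpus], id2word=dictionary,
                      num_topics=2)                # LSI topic space
similarity = similarities.MatrixSimilarity(tfidf_model[corpus],
                                           num_features=len(dictionary))
similarity.num_best = 5  # return up to 5 (doc_index, similarity) pairs

query = dictionary.doc2bow(['理赔', '咨询'])
print(similarity[tfidf_model[query]])  # e.g. [(0, 0.9...), (2, 0.2...)]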
Example #3
def remove_duplicates(infile, inname, outfile):
    infile_list = loadLine(infile)
    names = inname.split('/')[-1].split("-")
    print('names', names)
    seen = set()  # track lines already written so only first occurrences survive
    with open(outfile, 'a', encoding='utf-8') as outopen:
        for line in infile_list:
            if line not in seen:
                seen.add(line)
                outopen.write(line.strip() + '\t' + names[0] + '\n')
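
A hypothetical call, assuming `inname` is a path whose hyphen-separated basename supplies the tag (all file names below are made up):

remove_duplicates('data/notes_raw.txt', 'labels/car_insurance-v1.txt',
                  'data/notes_dedup.txt')
# each unique line of notes_raw.txt is written once as "<line>\tcar_insurance"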
Example #4
    def word_segmentation(self, input_path, out_path):
        sentence_list = loadLine(input_path)
        outOpen = open(out_path, 'w', encoding='utf-8')

        for sentence_item in sentence_list:
            in_json = json.loads(sentence_item)  # Decode the JSON record
            remark = in_json['remarks']
            # remark = sentence_item
            print('sentence_item', remark)
            result_list = self.parser.segmentor.segment(remark)
            outOpen.write(', '.join(result_list) + '\n')
        outOpen.close()
Example #5
def save_no_index_tag():
    list_remark = loadLine('../../res/foo.txt')

    fp = open('../../assert/triple-extractor.txt', "w", encoding='utf-8', errors='ignore')

    for item_data in list_remark:
        print('item_data', item_data)
        in_json = json.loads(item_data)  # Decode the JSON record
        remarks = in_json['remarks'] if in_json['remarks'] else ''
        custom_state = in_json['custom_state'] if in_json['custom_state'] else "\t\t"
        if remarks != '':
            sentences = split_sents(remarks)
            temp_content = []
            for sentence in sentences:
                words, postags, child_dict_list, roles_dict, format_parse_list, parse_child_dict = parse.parser_main(sentence)
                for temp_item in format_parse_list:
                    # Collect the children of every head (HED) relation
                    if 'HED' in temp_item[0]:
                        temp_content.append(parse_child_dict[temp_item[2]])
            if temp_content:
                # Guard against records where no HED relation was found
                fp.write(custom_state + '-----' + str(words) + '++++++++' + temp_item[1] + '\t' + str(temp_content[0]) + '\n')
        else:
            fp.write(custom_state + '\t' + '' + '\n\n\n')
    fp.close()
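
`split_sents` and `parse.parser_main` come from elsewhere in the project. A plausible regex-based stand-in for `split_sents`, offered only as an assumption about its behavior on Chinese text:

import re

def split_sents(content):
    # Hypothetical reconstruction: split on common Chinese sentence-ending
    # punctuation and drop empty fragments.
    return [s for s in re.split(r'[。！？!?\r\n]', content) if s.strip()]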
Example #6
    def word_property(self, file_path, save_path_property):
        content_list = loadLine(file_path)

        for content_item in content_list:
            in_json = json.loads(str(content_item))  # Decode the JSON record
            # print('note_item', in_json['remarks'])
            remark = in_json['remarks']
            # Word segmentation
            words = self.parser.segmentor.segment(remark)
            # print('words', words)
            # Part-of-speech tagging
            postags = self.parser.postagger.postag(words)
            # print('postags', postags)

            for i in range(len(postags)):
                # Append each word to a file named after its POS tag
                save_path_temp = save_path_property + postags[i]
                fp = open(save_path_temp,
                          "a",
                          encoding='utf-8',
                          errors='ignore')
                fp.write(words[i] + '\t')
                fp.close()
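
`self.parser` bundles a segmentor and a POS tagger whose method names (`segment`, `postag`) match pyltp's `Segmentor` and `Postagger`. A minimal initialization sketch under that assumption; the wrapper class and model directory are placeholders:

from pyltp import Segmentor, Postagger

class LtpParser:
    # Hypothetical wrapper matching the self.parser used in the examples above.
    def __init__(self, model_dir='ltp_data'):
        self.segmentor = Segmentor()
        self.segmentor.load(model_dir + '/cws.model')  # word segmentation model
        self.postagger = Postagger()
        self.postagger.load(model_dir + '/pos.model')  # POS tagging model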
Example #7
            for token in layer:
                print(token, end=' ')

    def __printTree(self, node, layer, layers):
        layer += 1
        if layer not in layers and node.hasChildren():
            # Start a header string for this depth on first visit
            layers[layer] = "layer " + str(layer) + ":\n"
        for child in node.getChildren():
            child = node.get(child)
            printedLayer = '        ' + child.getChar() + ' => ' + str([
                child.get(grandchild).getChar()
                for grandchild in child.getChildren()
            ]) + '\n'
            layers[layer] += printedLayer
            self.__printTree(child, layer, layers)


if __name__ == '__main__':

    trie = Trie()
    list_remark = loadLine('../../res/product_name.txt')

    for item_data in list_remark:
        trie.add(item_data)

    print(trie.wordsWithPrefix('臻享'))
    print(trie.wordsWithPrefix('长安'))
    print(trie.wordsWithPrefix('财富'))
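
The `Trie` class is defined elsewhere; the `__main__` block exercises `add` and `wordsWithPrefix`, and `__printTree` expects nodes exposing `getChildren`, `get`, `getChar`, and `hasChildren`. A minimal sketch of a trie matching that interface, as an assumption about the real implementation:

class TrieNode:
    def __init__(self, char=''):
        self.char = char
        self.children = {}   # char -> TrieNode
        self.isWord = False

    def getChar(self):
        return self.char

    def getChildren(self):
        return self.children.keys()

    def get(self, char):
        return self.children[char]

    def hasChildren(self):
        return bool(self.children)


class Trie:
    def __init__(self):
        self.root = TrieNode()

    def add(self, word):
        node = self.root
        for char in word.strip():
            node = node.children.setdefault(char, TrieNode(char))
        node.isWord = True

    def wordsWithPrefix(self, prefix):
        # Walk down to the prefix node, then collect every complete word below it.
        node = self.root
        for char in prefix:
            if char not in node.children:
                return []
            node = node.get(char)
        words = []

        def collect(n, path):
            if n.isWord:
                words.append(path)
            for c in n.getChildren():
                collect(n.get(c), path + n.get(c).getChar())

        collect(node, prefix)
        return words

A quick usage check with made-up product names:

trie = Trie()
for name in ['臻享金生', '臻享福', '长安福瑞']:
    trie.add(name)
print(trie.wordsWithPrefix('臻享'))  # ['臻享金生', '臻享福']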