Ejemplo n.º 1
0
def extraction_option(text):
    """
    抽取新闻文本中人物,观点
    :param text: 新闻文本
    :return:
    """
    # text = token(text)

    cws_model_path = os.path.join(LTP_TOP_DIR, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
    # segmentor = Segmentor()  # 初始化实例
    # segmentor.load(cws_model_path)  # 加载模型

    pos_model_path = os.path.join(LTP_TOP_DIR, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
    postagger = Postagger()  # 初始化实例
    postagger.load(pos_model_path)  # 加载模型

    ner_model_path = os.path.join(LTP_TOP_DIR, 'ner.model')  # 命名实体识别模型路径,模型名称为`pos.model`
    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型

    par_model_path = os.path.join(LTP_TOP_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
    parser = Parser()  # 初始化实例
    parser.load(par_model_path)  # 加载模型

    # cut sentences

    sentences = SentenceSplitter.split(text)
    print(f'sentences:{list(sentences)}')
    opinion_results = []
    say_words = load_saywords()

    for sentence in sentences:
        # cut words
        words = cut(sentence)
        # POS tagging
        postags = postagger.postag(words)
        # named entity recognition
        netags = recognizer.recognize(words, postags)
        # netags = list(netags)
        # dependency parsing
        arcs = parser.parse(words, postags)

        arcs = [(arc.head, arc.relation) for arc in arcs]

        print(f'words: {list(words)} \npost_tags: {list(postags)}\nnettags: {list(netags)}\narcs:{arcs}')

        # print([(arc.head, arc.relation) for arc in arcs])

        if not [i for i in netags if i in ['S-Nh', 'S-Ni', 'S-Ns']]:
            continue

        hed_index = 0
        for arc in arcs:
            if arc[1] == 'HED':
                break
            hed_index += 1

        print(f"HED: {words[hed_index]}")

        say_word = words[hed_index]
        # if say_word in say_words:

        arcs_new = [(i, arc) for i, arc in enumerate(arcs) if arc[1] == 'SBV']  # SBV marks a subject-verb relation; collect those arcs
        print(arcs_new)

        for arc in arcs_new:
            verb_index = arc[1][0]
            subject = arc[0]
            if words[verb_index - 1] not in say_words:
                continue
            opinion_results.append((words[subject], words[verb_index - 1], ''.join(words[verb_index:])))

    postagger.release()
    recognizer.release()
    parser.release()

    return opinion_results
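Ejemplo n.º 2
0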
from pycorenlp import StanfordCoreNLP
from pyltp import NamedEntityRecognizer
import pickle
from pos import store_state
import pandas as pd
#  The main purpose of this file is to find person names

file = open(
    r'C:\Users\Terrence\Documents\NLP\extract_news\temp_data\pos.pickle', 'rb')
pos_pd = pd.DataFrame(pickle.load(file))

ner_model = r'C:\Users\Terrence\Documents\NLP\NLP_tools\ltp_data_v3.4.0\ltp_data_v3.4.0\ner.model'
reg = NamedEntityRecognizer()
reg.load(ner_model)
reg_dict = {'sentence': [], 'tags': [], 'ner': []}
for index, row in pos_pd.iterrows():
    ner = reg.recognize(row['sentence'], row['tags'])
    reg_dict['sentence'].append(row['sentence'])
    reg_dict['tags'].append(row['tags'])
    reg_dict['ner'].append(list(ner))

store_state(
    r'C:\Users\Terrence\Documents\NLP\NLP_tools\ltp_data_v3.4.0\ltp_data_v3.4.0\ner.pickle',
    reg_dict)
file = open(
    r'C:\Users\Terrence\Documents\NLP\NLP_tools\ltp_data_v3.4.0\ltp_data_v3.4.0\ner.pickle',
    'rb')

reg.release()

# Here we experiment with using grammar rules for chunking
Ejemplo n.º 3
0
    def __init__(self):
        """
        Required init method: sets batch_size and loads some resources.
        """
        self.batch_size = 2048

        FLAGS = tf.app.flags.FLAGS
        tf.app.flags.DEFINE_string("ckpt_dir", "predictor/checkpoint/",
                                   "checkpoint location for the model")
        tf.app.flags.DEFINE_string(
            "ckpt_dir_accu", "./all_data/textcnn_final/checkpoint_accu/",
            "checkpoint location for the model")
        tf.app.flags.DEFINE_string("ckpt_dir_law",
                                   "./all_data/textcnn_final/checkpoint_law/",
                                   "checkpoint location for the model")
        tf.app.flags.DEFINE_string(
            "ckpt_dir_imprision",
            "./all_data/textcnn_final/checkpoint_imprision/",
            "checkpoint location for the model")

        tf.app.flags.DEFINE_string("vocab_word_path",
                                   "predictor/word_freq.txt",
                                   "path of word vocabulary.")
        tf.app.flags.DEFINE_string("accusation_label_path",
                                   "predictor/accu.txt",
                                   "path of accusation labels.")
        tf.app.flags.DEFINE_string("article_label_path", "predictor/law.txt",
                                   "path of law labels.")
        tf.app.flags.DEFINE_string("stopwords_file", "predictor/stopword.txt",
                                   "path of stopword")

        tf.app.flags.DEFINE_float("learning_rate", 0.001, "learning rate")
        tf.app.flags.DEFINE_integer(
            "decay_steps", 1000, "how many steps before decay learning rate.")
        tf.app.flags.DEFINE_float("decay_rate", 1.0,
                                  "Rate of decay for learning rate.")
        tf.app.flags.DEFINE_integer("sentence_len", 400, "max sentence length")
        tf.app.flags.DEFINE_integer("num_sentences", 16, "number of sentences")
        tf.app.flags.DEFINE_integer("embed_size", 64, "embedding size")  #64
        tf.app.flags.DEFINE_integer("hidden_size", 256, "hidden size")  #128
        tf.app.flags.DEFINE_integer(
            "num_filters", 128,
            "number of filters for each filter map used in the CNN.")  #128

        tf.app.flags.DEFINE_integer("embed_size_dpcnn", 64, "embedding size")
        tf.app.flags.DEFINE_integer("hidden_size_dpcnn", 128, "hidden size")
        #tf.app.flags.DEFINE_integer("num_filters_big", 128, "number of filter for a filter map used in CNN.")
        tf.app.flags.DEFINE_string(
            "model_dpcnn", "dp_cnn",
            "name of model:han,c_gru,c_gru2,gru,text_cnn")

        tf.app.flags.DEFINE_string("ckpt_dir_dpcnn",
                                   "./checkpoint_dpcnn_big/checkpoint/",
                                   "checkpoint location for the model")

        tf.app.flags.DEFINE_boolean(
            "is_training", False,
            "is training. true: training, false: testing/inference")
        tf.app.flags.DEFINE_string(
            "model", "text_cnn", "name of model:han,c_gru,c_gru2,gru,text_cnn")
        #tf.app.flags.DEFINE_boolean("is_training_flag", False, "is traning.true:tranining,false:testing/inference")
        tf.app.flags.DEFINE_string('cws_model_path', 'predictor/cws.model',
                                   'cws.model path')
        tf.app.flags.DEFINE_string('pos_model_path', 'predictor/pos.model',
                                   'pos.model path')
        tf.app.flags.DEFINE_string('ner_model_path', 'predictor/ner.model',
                                   'ner.model path')
        tf.app.flags.DEFINE_string('gpu', '0', 'which GPU device to use')
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
        segm = Segmentor()
        segm.load(FLAGS.cws_model_path)  # LTP models
        post = Postagger()
        post.load(FLAGS.pos_model_path)
        recognizer = NamedEntityRecognizer()
        recognizer.load(FLAGS.ner_model_path)
        self.ltp_model = [segm, post, recognizer]

        filter_sizes = [2, 3, 4, 5]  # ,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10]  # [30,40,50] #8
        #filter_sizes_big= [2,3,4,5]#,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10]  # [30,40,50] #8

        stride_length = 1

        #1.load label dict, restore model from checkpoint
        # 1.load label dict
        self.vocab_word2index = load_word_vocab(FLAGS.vocab_word_path)
        accusation_label2index = load_label_dict_accu(
            FLAGS.accusation_label_path)
        articles_label2index = load_label_dict_article(
            FLAGS.article_label_path)

        deathpenalty_label2index = {True: 1, False: 0}
        lifeimprisonment_label2index = {True: 1, False: 0}
        vocab_size = len(self.vocab_word2index)
        accusation_num_classes = len(accusation_label2index)
        article_num_classes = len(articles_label2index)
        deathpenalty_num_classes = len(deathpenalty_label2index)
        lifeimprisonment_num_classes = len(lifeimprisonment_label2index)

        # 2.restore checkpoint
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        graph = tf.Graph()
        with graph.as_default():
            self.model = HierarchicalAttention(accusation_num_classes,
                                               article_num_classes,
                                               deathpenalty_num_classes,
                                               lifeimprisonment_num_classes,
                                               FLAGS.learning_rate,
                                               self.batch_size,
                                               FLAGS.decay_steps,
                                               FLAGS.decay_rate,
                                               FLAGS.sentence_len,
                                               FLAGS.num_sentences,
                                               vocab_size,
                                               FLAGS.embed_size,
                                               FLAGS.hidden_size,
                                               num_filters=FLAGS.num_filters,
                                               model=FLAGS.model,
                                               filter_sizes=filter_sizes,
                                               stride_length=stride_length)
            saver = tf.train.Saver()
            sess = tf.Session(config=config)
            saver.restore(sess,
                          tf.train.latest_checkpoint(FLAGS.ckpt_dir_accu))
            self.sess_accu = sess

            saver_law = tf.train.Saver()
            sess_law = tf.Session(config=config)
            saver_law.restore(sess_law,
                              tf.train.latest_checkpoint(FLAGS.ckpt_dir_law))
            self.sess_law = sess_law

        self.FLAGS = FLAGS
Ejemplo n.º 4
0
def mingming_shiti(words, postags):
    """Named entity recognition: organizations (Ni), persons (Nh), places (Ns)."""
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    print ("\t".join(netags))
                seg_list.insert(i + 1, '\n')
                seg_list.insert(i + 2, remains2)
        # Re-join digits that were split apart
        if seg_list[i].isdigit() and seg_list[i + 1].isdigit():
            seg_list[i] = seg_list[i] + seg_list[i + 1]
            del seg_list[i + 1]

        i += 1

    # POS tagging
    postagger = Postagger()  # initialize the POS tagger instance
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(seg_list)  # POS tagging

    # Named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize the NER instance
    recognizer.load(ner_model_path)  # load the model
    netags = recognizer.recognize(seg_list, postags)  # named entity recognition

    # Write out the results
    f2 = open("分词_词性标注_命名实体识别_结果.txt", "w", encoding='utf-8')
    for word, postag, netag in zip(seg_list, postags, netags):
        if word == '\n':
            f2.write('\n')
        else:
            f2.write(word + " " + postag + " " + netag + "\n")
    f2.close()

    # Release the models
    segmentor.release()
    postagger.release()
Ejemplo n.º 6
0
    def __init__(self, configure):

        self.system_logger = logging.getLogger("system_log")

        self._sentence_splitter = SentenceSplitter

        self._segmentor = Segmentor()
        self._segmentor.load_with_lexicon(
            configure.nlp_data_root + "/cws.model",
            configure.nlp_data_root + "/cws.tsv")

        self._segmentor_without_dictionary = Segmentor()
        self._segmentor_without_dictionary.load(configure.nlp_data_root +
                                                "/cws.model")

        self._postagger = Postagger()
        self._postagger.load(configure.nlp_data_root + "/pos.model")

        self._ner_recognizer = NamedEntityRecognizer()
        self._ner_recognizer.load(configure.nlp_data_root + "/ner.model")

        self._dependency_parser = Parser()
        self._dependency_parser.load(configure.nlp_data_root + "/parser.model")

        self._srl = SementicRoleLabeller()
        self._srl.load(configure.nlp_data_root + "/pisrl.model")

        self._stopwords_file = configure.nlp_data_root + "/stopwords.txt"
        self._stopwords_set = set([
            tk.strip() for tk in codecs.open(self._stopwords_file, 'r',
                                             'utf-8').read().splitlines()
            if tk.strip() != ""
        ])

        self.entity_type_mapping_file = configure.entity_type_mapping_file
        self.entity_type_mapping = defaultdict()
        for line in codecs.open(self.entity_type_mapping_file, 'r',
                                'utf-8').read().splitlines():
            elems = line.split("\t")
            if len(elems) != 2:
                log_str = "Format error in file [%s] !!!\n" % self.entity_type_mapping_file
                self.system_logger.error(log_str)
                sys.stderr.write(log_str)
                continue
            self.entity_type_mapping[int(
                elems[0])] = "<" + str(elems[0]) + "_" + elems[1].strip() + ">"
        self.all_entity_replacements = list(self.entity_type_mapping.values())

        self.entity_type_exclusion_file = configure.entity_type_exclusion_file
        self.entity_type_exclusion_mapping = defaultdict()
        for line in codecs.open(self.entity_type_exclusion_file, 'r',
                                'utf-8').read().splitlines():
            elems = line.split("\t")
            if len(elems) != 2:
                log_str = "Format error in file [%s] !!!\n" % self.entity_type_exclusion_file
                self.system_logger.error(log_str)
                sys.stderr.write(log_str)
                continue
            self.entity_type_exclusion_mapping[int(
                elems[0])] = "<" + str(elems[0]) + "_" + elems[1].strip() + ">"
        self.entity_type_exclusion_set = set(
            self.entity_type_exclusion_mapping.keys())

        trie_tree, lexicon = generate_trie_tree(configure.nlp_data_root +
                                                "/trust_list.tsv")
        self._lexicon = lexicon
        self._trie_tree = trie_tree

        self.entity_linker = EntityLinker()

        self.dialog_act_classifier = DialogActClassifier(
            configure.dialog_act_classifier_configure)

        self.emotion_classifier = EmotionClassifier(
            configure.emotion_classifier_configure)

        self.yes_no_classifier = YesNoClassifier(
            configure.attitude_classifier_configure)
        self.like_dislike_classifier = LikeDislikeClassifier(
            configure.attitude_classifier_configure)

        self.question_classifier = QuestionClassifier(
            configure.question_classifier_configure)
        self.question_response = ""

        self.noun_phrase_generator = noun_phrase_generator

        self.segmentor_plus = segmentor_plus

        self.turn_on = configure.turn_on
Ejemplo n.º 7
0
def getRelation(paragraph):
    """
	paragraph: a list of string, each string is a sentence
	return: a list of relations and a dict which records the number of occurrence of differents DSNF
	"""
    relations = []
    dict_DSNF = {
        'num_DSNF1': 0,
        'num_DSNF2': 0,
        'num_DSNF3': 0,
        'num_DSNF7': 0,
    }

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))

    for iteration, sentence in enumerate(paragraph):
        print("evaluate the " + str(iteration + 1) + "-th sentences")

        sentence = SentenceSplitter.split(sentence)[0]

        words = segmentor.segment(sentence)
        # print("\t".join(words))

        postags = postagger.postag(words)
        # the list-of-strings parameter is supported in 0.1.5
        # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
        # print("\t".join(postags))

        arcs = parser.parse(words, postags)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        netags = recognizer.recognize(words, postags)
        # print("\t".join(netags))

        # labeller = SementicRoleLabeller()
        # labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        # roles = labeller.label(words, postags, arcs)
        # for role in roles:
        #     print(role.index, "".join(
        #             ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

        entityList = findEntities(netags)
        # print(entityList)
        entities = []
        for i in entityList:
            l = ''
            for j in i:
                l += words[j]
            entities.append(l)
        print("entities in " + str(iteration + 1) + "-th sentence : ",
              entities)

        DSNF1_ret = DSNF1(arcs, entityList, words, netags)
        DSNF2_ret = DSNF2(arcs, entityList, words)
        DSNF3_ret = DSNF3(arcs, entityList, words, postags)
        DSNF7_ret = DSNF7(arcs, entityList, words)
        # print("DSNF1 result: ", DSNF1_ret)
        # print("DSNF2 result: ", DSNF2_ret)
        # print("DSNF3 result: ", DSNF3_ret)
        # print("DSNF7 result: ", DSNF7_ret)
        relation = []
        for r in DSNF1_ret:
            dict_DSNF['num_DSNF1'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF2_ret:
            dict_DSNF['num_DSNF2'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF3_ret:
            dict_DSNF['num_DSNF3'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF7_ret:
            dict_DSNF['num_DSNF7'] += 1
            relation.append(r)
            relations.append(r)
        print("with entities relation: ", relation)
        print("--" * 30)

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()

    return relations, dict_DSNF
Ejemplo n.º 8
0
    def get_vec(self, M_class):
        segmentor = Segmentor()
        segmentor.load('cws.model')
        postagger = Postagger()  # initialize the instance
        postagger.load('pos.model')  # load the model
        recognizer = NamedEntityRecognizer()  # initialize the instance
        recognizer.load('ner.model')  # load the model
        labeller = SementicRoleLabeller()  # initialize the instance
        labeller.load('pisrl.model')  # load the model
        parser = Parser()
        parser.load('parser.model')

        def merge_base(cut, pos, ner, arcs_list):
            base_list = []
            cw = []
            dr = []
            cw_w = []
            for i in range(len(cut)):
                base_list.append(cut[i] + '/' + pos[i])
            for i in range(len(arcs_list)):
                cw.append(arcs_list[i].split(':')[0])
                dr.append(arcs_list[i].split(':')[1])
            # for i in range(len(cw)):
            #     if cw[i] != '0':
            #         base_list.append(cut[i] + '/' + cut[int(cw[i]) - 1])
            #     else:
            #         base_list.append(cut[i] + '/')
            cw = list(set(cw))
            dr = list(set(dr))
            for ci in cw:
                base_list.append(cut[int(ci) - 1])
            for di in dr:
                base_list.append(di)
            for i in range(len(cut)):
                base_list.append(cut[i] + '/' + ner[i])

            # print(base_list)
            # base_list.append(cut[i])
            # print(base_list)
            return base_list

        def cut_segment(data, stop_words):
            all_words = []
            type_dict = {}  # class -> words
            type_dict_freq = {}  # class -> word -> frequency
            type_dict_num = {}  # class -> number of questions
            type_dict_doc = {}  # class -> word -> document count
            all_num = 0
            for k in data.keys():
                words = []
                type_dict[k] = []
                words_freq = {}
                for line in data[k]:
                    cut_line = '\t'.join(segmentor.segment(line))
                    word_list = cut_line.split('\t')  # word segmentation
                    # print(word_list)
                    words_list = []
                    for word in word_list:
                        if word not in self.stop_word:
                            words_list.append(word)
                    # print(words_list)
                    postags = postagger.postag(words_list)  # POS tagging
                    pos_line = '\t'.join(postags)
                    pos_list = pos_line.split('\t')

                    netags = recognizer.recognize(words_list,
                                                  pos_list)  # named entity recognition
                    ner_line = '\t'.join(netags)
                    ner_list = ner_line.split('\t')

                    arcs = parser.parse(words_list, pos_list)  # dependency parsing
                    arcs_line = "\t".join("%d:%s" % (arc.head, arc.relation)
                                          for arc in arcs)
                    arcs_list = arcs_line.split('\t')

                    words_list = merge_base(words_list, pos_list, ner_list,
                                            arcs_list)
                    # print(words_list)
                    for word in words_list:
                        is_adddoc = True
                        if word not in stop_words:
                            words.append(word)
                            if word not in words_freq.keys():
                                words_freq[word] = 1
                            else:
                                words_freq[word] += 1
                            if is_adddoc:  # count the number of questions the word appears in
                                if word not in type_dict_doc.keys():
                                    type_dict_doc[word] = 1
                                else:
                                    type_dict_doc[word] += 1
                                is_adddoc = False
                words = list(set(words))
                type_dict[k] = words
                type_dict_freq[k] = words_freq
                type_dict_num[k] = len(data[k])
                all_num += len(data[k])
                print(
                    'class ' + str(k) + '\tnumber of questions: ' +
                    str(type_dict_num[k]) + '\tnumber of words:', len(words))
                all_words = list(set(words) | set(all_words))
                # for wd in type_dict_doc.keys():
                #     print(wd, 'appears in documents:', type_dict_doc[wd])
            print('total number of words:', len(all_words))
            print('total number of questions:', all_num)
            self.N = all_num
            # for i in type_dict.keys():
            #     print(i,type_dict[i])
            # for i in type_dict_freq.keys():
            #     print(i,type_dict_freq[i])
            # print(sorted(type_dict_doc.items(),key=lambda d: d[1], reverse=True))
            return type_dict_num, type_dict, type_dict_freq, type_dict_doc

        def choose_features(type_dict_freq):
            def word_freq():  # select features by word frequency
                word_features = []
                for k in type_dict_freq.keys():
                    # print(k,type_dict_freq[k])
                    result = sorted(type_dict_freq[k].items(),
                                    key=lambda d: d[1],
                                    reverse=True)
                    # print(k,result[:200])
                    if M_class:
                        # for r in result[:300]:
                        for r in result:
                            word_features.append(r[0])
                    else:
                        # for r in result[:30]:
                        for r in result:
                            word_features.append(r[0])
                word_features = list(set(word_features))
                word_features = sorted(word_features)
                # print(word_features)
                return word_features

            word_features = []
            word_features = word_freq()
            # word_features = word_chi()
            return word_features

        def get_tf_idf(word_features, type_dict_freq, type_dict_doc):
            tf_idf = {}
            for k in type_dict_freq.keys():
                tf_idf[k] = [0 for i in range(len(word_features))]
                # print(type_dict_freq[k].keys())
                for w_f in word_features:
                    if w_f in type_dict_freq[k]:
                        # print(w_f,word_features.index(w_f),type_dict_freq[k][w_f])
                        tf_idf[k][word_features.index(
                            w_f)] = type_dict_freq[k][w_f]
                # print(word_features)
                # print(tf_idf[k])

            k_d = {}
            for k in tf_idf.keys():
                k_d[k] = 0
                for i in tf_idf[k]:
                    k_d[k] += i

            for w_f in word_features:  # apply the idf weighting
                for k in tf_idf.keys():
                    tf_idf[k][word_features.index(w_f)] /= k_d[k]
                    idf = math.log(self.N / type_dict_doc[w_f])
                    tf_idf[k][word_features.index(w_f)] *= float(idf)

                    # tf_idf[k][word_features.index(w_f)] = (tf_idf[k][word_features.index(w_f)]>0)
                    # if tf_idf[k][word_features.index(w_f)]:
                    #     tf_idf[k][word_features.index(w_f)]  = 1
                    # else:
                    #     tf_idf[k][word_features.index(w_f)] = 0
            return tf_idf

        def write_vec(type_dict_freq, word_features, train_or_test, M_class):
            w_dict = {}
            types = {}
            trains = []
            ti = 0
            for i in type_dict_freq.keys():  # assign an integer code to each class
                ti += 1
                types[i] = ti
            if not M_class:
                types['OBJ_ADDRESS'] = len(types.keys()) + 1
                with open('types_S.txt', 'w') as fr:
                    for k in types.keys():
                        fr.write(str(types[k]) + '\t' + str(k))
                        fr.write('\n')
            else:
                with open('types_M.txt', 'w') as fr:
                    for k in types.keys():
                        fr.write(str(types[k]) + '\t' + str(k))
                        fr.write('\n')
            if train_or_test:
                if M_class:
                    w_dict = self.MC_dict
                else:
                    w_dict = self.SC_dict
            else:
                # w_dict =  self.load_data('question_classification/test_questions.txt',M_class)
                w_dict = {}
                type = ['DES', 'HUM', 'LOC', 'NUM', 'OBJ', 'TIME', 'UNKNOWN']
                for t in type:
                    w_dict[t] = []
                # ftra = open('data/train.json','r')
                ftra = open('train_test.txt', 'r')
                for line in ftra:
                    line_list = line.replace('"question":"', 'cut_f').replace(
                        ', "answer_pid":', 'cut_f').split('cut_f')
                    w_dict['DES'].append(line_list[1][:-2])
                # for line in ftra:
                #     line_list = line.replace('{"question": ', '').replace('"pid": ', 'cut_f').replace(
                #         '"answer_sentence": [',
                #         'cut_f').replace(
                #         '], "answer": ', 'cut_f').replace(' "qid": ', 'cut_f').replace('}', '').split('cut_f')
                #
                #     w_dict['DES'].append(line_list[0])
            for k in w_dict.keys():
                print(w_dict[k])
                for train in w_dict[k]:
                    # print(train)
                    cut_line = '\t'.join(segmentor.segment(train))
                    words_list = cut_line.split('\t')  # word segmentation
                    postags = postagger.postag(words_list)  # POS tagging
                    pos_line = '\t'.join(postags)
                    pos_list = pos_line.split('\t')

                    netags = recognizer.recognize(words_list,
                                                  pos_list)  # named entity recognition
                    ner_line = '\t'.join(netags)
                    ner_list = ner_line.split('\t')

                    arcs = parser.parse(words_list, pos_list)  # dependency parsing
                    arcs_line = "\t".join("%d:%s" % (arc.head, arc.relation)
                                          for arc in arcs)
                    arcs_list = arcs_line.split('\t')
                    # print(arcs_list)
                    words_list = merge_base(words_list, pos_list, ner_list,
                                            arcs_list)

                    train_vec = str(types[k]) + ' '
                    for i in words_list:
                        if i in word_features:
                            if train_vec.find(
                                    str(word_features.index(i)) + ':1') == -1:
                                train_vec += str(
                                    word_features.index(i)) + ':1 '
                    trains.append(train_vec)
            if M_class:
                if train_or_test:
                    with open('train_M.txt', 'w') as fr:
                        print('number of vectors:', len(trains))
                        for tv in trains:
                            fr.write(tv)
                            fr.write('\n')
                else:
                    with open('test_M.txt', 'w') as fr:
                        print('number of vectors:', len(trains))
                        for tv in trains:
                            fr.write(tv)
                            fr.write('\n')
            else:
                if train_or_test:
                    with open('train_S.txt', 'w') as fr:
                        print('number of vectors:', len(trains))
                        for tv in trains:
                            fr.write(tv)
                            fr.write('\n')
                else:
                    with open('test_S.txt', 'w') as fr:
                        print('number of vectors:', len(trains))
                        for tv in trains:
                            fr.write(tv)
                            fr.write('\n')

        if M_class:
            type_dict_num, type_dict, type_dict_freq, type_dict_doc = cut_segment(
                self.MC_dict, self.stop_word)
        else:
            type_dict_num, type_dict, type_dict_freq, type_dict_doc = cut_segment(
                self.SC_dict, self.stop_word)

        word_features = choose_features(type_dict_freq)
        print(word_features)
        print('number of features:', len(word_features))
        # tf_idf = get_tf_idf(word_features,type_dict_freq,type_dict_doc)
        # return tf_idf,type_dict_num,word_features
        # write_vec(type_dict_freq, word_features, True, M_class)
        write_vec(type_dict_freq, word_features, False, M_class)
Ejemplo n.º 9
0
def main():
    clf = joblib.load('model.pkl')
    pf = list()
    ne1s = list()
    ne2s = list()
    model = models.Word2Vec.load_word2vec_format('cn.cbow.bin', binary=True, unicode_errors = 'ignore')
    segmentor = Segmentor()
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    segmentor.load("ltp_data/cws.model")
    postagger.load("ltp_data/pos.model")
    recognizer.load("ltp_data/ner.model")
    ifsen = 1
    input = open("trelationExtractionTrainingCorpus.txt", "r")
    outputfv = open('feature_vector.txt', 'w')
    outputfr = open('feature_result.txt', 'w')
    outputp = open('predict_result.txt', 'w')
    line = input.readline()
    senNo = 0
    while line:
        if line[0] == '|':
            namedEntityBegin = list()
            namedEntityEnd = list()
            namedEntityCount = 0
            i = 0
            for netag in netags:
                if netag == 'O':
                    i = i + 1
                    continue
                if netag == 'S-Ni' or netag == 'S-Nh' or netag == 'S-Ns':
                    namedEntityBegin.append(i)
                    namedEntityEnd.append(i)
                    namedEntityCount = namedEntityCount + 1
                    i = i + 1
                    continue
                if netag == 'B-Ni' or netag == 'B-Nh' or netag == 'B-Ns':
                    namedEntityBegin.append(i)
                    namedEntityCount = namedEntityCount + 1
                    i = i + 1
                    continue
                if netag == 'E-Ni' or netag == 'E-Nh' or netag == 'E-Ns':
                    namedEntityEnd.append(i)
                    i = i + 1
                    continue
                else:
                    i = i + 1
                    continue
            for i in range(namedEntityCount):
                j = namedEntityBegin[i]
                while (j<=namedEntityEnd[i]):
                    print words[j],
                    j = j + 1
                print '\n'
            for i in range(namedEntityCount):
                for j in range(namedEntityCount):
                    if j > i:
                        print '%d, %d' % (i,j)
                        neType1 = neType(netags[namedEntityBegin[i]])
                        neType2 = neType(netags[namedEntityBegin[j]])
                        if neType1*neType2>0 or neType1+neType2==0:
                            continue
                        featureVector = list()
                        featureVector.append(neType1)
                        featureVector.append(neType2)
                        if namedEntityBegin[i] < 3:
                            leftWindowScale = namedEntityBegin[i]
                        else:
                            leftWindowScale = 2
                        featureVector.append(leftWindowScale)
                        if leftWindowScale == 0:
                            for k in range(300):
                                featureVector.append(0)
                                featureVector.append(0)
                        elif leftWindowScale == 1:
                            try:
                                t = model[words[namedEntityBegin[i]-1].decode('utf-8')]
                                for k in t:
                                    featureVector.append(k)
                            except:
                                for k in range(300):
                                    featureVector.append(0)
                            for k in range(300):
                                featureVector.append(0)
                        else:
                            for k in range(2):
                                try:
                                    t = model[words[namedEntityBegin[i]-k-1].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                        wordsLen = len(words)
                        rightWindowScale = wordsLen - namedEntityEnd[j]
                        if rightWindowScale > 2:
                            rightWindowScale = 2
                        featureVector.append(rightWindowScale)
                        if rightWindowScale == 0:
                            for k in range(300):
                                featureVector.append(0)
                                featureVector.append(0)
                        elif rightWindowScale == 1:
                            try:
                                t = model[words[namedEntityEnd[j]+1].decode('utf-8')]
                                for k in t:
                                    featureVector.append(k)
                            except:
                                for k in range(300):
                                    featureVector.append(0)
                            for k in range(300):
                                featureVector.append(0)
                        else:
                            for k in range(2):
                                try:
                                    t = model[words[namedEntityEnd[j]+1+k].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                        wordBetweenCount = namedEntityBegin[j] - namedEntityEnd[i] - 1
                        featureVector.append(wordBetweenCount)
                        if wordBetweenCount == 0:
                            for k in range(10):
                                for ktemp in range(300):
                                    featureVector.append(0)
                        elif wordBetweenCount <= 10:
                            for k in range(wordBetweenCount):
                                try:
                                    t = model[words[namedEntityEnd[i]+k+1].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                            for k in range(10-wordBetweenCount):
                                for ktemp in range(300):
                                    featureVector.append(0)
                        else:
                            for k in range(5):
                                try:
                                    t = model[words[namedEntityEnd[i]+k+1].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                            for k in range(5):
                                try:
                                    t = model[words[namedEntityBegin[j]-5+k].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                        pf.append(featureVector)
                        neIndex = namedEntityBegin[i]
                        ne1 = words[neIndex]
                        while neIndex < namedEntityEnd[i]:
                            neIndex = neIndex + 1
                            ne1 = ne1 + words[neIndex]
                        ne1s.append(ne1)
                        neIndex = namedEntityBegin[j]
                        ne2 = words[neIndex]
                        while neIndex < namedEntityEnd[j]:
                            neIndex = neIndex + 1
                            ne2 = ne2 + words[neIndex]
                        ne2s.append(ne2)
                        ifRelation = 0
                        for k in range(relationCount):
                            if (ne1 == relations[k][0] or ne1 == relations[k][1]) and (ne2 == relations[k][0] or ne2 == relations[k][1]) and (ne1 != ne2):
                                ifRelation = 1
                                break
                        if ifRelation == 0:
                            featureResult = 3
                        else:
                            featureResult = relationType(relations[k][2])
                        for k in featureVector:
                            outputfv.write('%f ' % k)
                        outputfv.write('\n')
                        outputfr.write(str(featureResult))
                        outputfr.write('\n')
                        print featureResult
            ifsen = 1
            line = input.readline()
            print 'senNo: %d' % senNo
            senNo = senNo + 1
            continue
        if ifsen == 1:
            print line
            line = unicodedata.normalize('NFKC', line.decode('utf-8')).encode('utf-8')
            words = segmentor.segment(line)
            postags = postagger.postag(words)
            netags = recognizer.recognize(words, postags)
            print "|".join(words)
            print "|".join(postags)
            print "|".join(netags)
            ifsen = 0
            relationCount = 0
            relations = list()
        else:
            relation = line.split(',')
            relations.append(relation)
            relationCount = relationCount + 1
            print "|".join(relations[relationCount-1])
            print relations[relationCount-1][2]
        line = input.readline()
    segmentor.release()
    postagger.release()
    recognizer.release()
    input.close()
    outputfv.close()
    outputfr.close()
    pred_res = clf.predict(pf)
    for i in pred_res:
        outputp.write(str(i))
        outputp.write('\n')
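Ejemplo n.º 10
0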
    def get_ner(self, word_list, postag_list, model):
        recognizer = NamedEntityRecognizer()
        recognizer.load(model)
        netags = recognizer.recognize(word_list, postag_list)  # named entity recognition
        recognizer.release()  # release the model
        self.ner_list = list(netags)
Ejemplo n.º 11
0
    def bayes(self, M_class):
        def merge_base(cut, pos, ner):
            base_list = []
            for i in range(len(cut)):
                base_list.append(cut[i] + '+' + pos[i] + '+' + ner[i])
            # print(base_list)
            return base_list

        def each_q(words_q):
            # print(words_q)
            q_a = {}
            for t in types:
                q_a[t] = types_f[t]
            for word in words_q:
                if word not in self.stop_word:
                    if word in word_features:
                        word_index = word_features.index(word)
                        # print(word, word_index)
                        for t in types:
                            q_a[t] *= tf_idf[t][word_index]
            result_type = sorted(q_a.items(), key=lambda d: d[1],
                                 reverse=True)[0]
            # print(result_type[0])
            return result_type[0]

        all_qa = []
        tf_idf, type_dict_num, word_features = self.get_vec(M_class)
        types = list(type_dict_num.keys())
        types_f = {}
        for t in types:
            types_f[t] = type_dict_num[t] / self.N
            # print(types_f[t])
        segmentor = Segmentor()
        segmentor.load('cws.model')
        postagger = Postagger()  # initialize the instance
        postagger.load('pos.model')  # load the model
        recognizer = NamedEntityRecognizer()  # initialize the instance
        recognizer.load('ner.model')  # load the model
        # q = 'API的全称是什么?'
        test_dict = self.load_data(
            'question_classification/test_questions.txt', True)

        for i in test_dict.keys():
            for j in range(len(test_dict[i])):
                q = test_dict[i][j]
                # cut_q = '\t'.join(segmentor.segment(q))
                # words_q = cut_q.split('\t')
                cut_line = '\t'.join(segmentor.segment(q))
                words_list = cut_line.split('\t')  # word segmentation

                postags = postagger.postag(words_list)  # POS tagging
                pos_line = '\t'.join(postags)
                pos_list = pos_line.split('\t')

                netags = recognizer.recognize(words_list, pos_list)  # named entity recognition
                ner_line = '\t'.join(netags)
                ner_list = ner_line.split('\t')

                words_list = merge_base(words_list, pos_list, ner_list)
                type = each_q(words_list)
                # type = each_q(words_q)
                all_qa.append(type + '\t' + q)
        with open('qc_answer.txt', 'w') as fr:
            for i in all_qa:
                fr.write(i)
                fr.write('\n')

        self.get_precesion('question_classification/test_questions.txt',
                           'qc_answer.txt', M_class)
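Ejemplo n.º 12
0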
    def ner(self, wordlist, postag_list):
        recognizer = NamedEntityRecognizer()
        recognizer.load(self.ner_model_path)
        netag_list = list(recognizer.recognize(wordlist, postag_list))
        return netag_list
Ejemplo n.º 13
0
Archivo: QA.py Proyecto: Goerwa/QA
    def get_ner(self):
        def is_in(word, word_set):
            for i in word_set:
                if i.find(word) != -1:
                    return True
            return False

        def get_feature(type, pos):
            r_str = ''
            r_str += str(100 + all_types.index(pos + '_' + type)) + ':1 '
            r_str += str(100 + all_types.index(pos)) + ':1 '
            return r_str

        def get_answer_pos(l, answer):
            r = [0 for n in range(len(l))]
            # print(answer[1:-2])
            r_str = ''
            i = 0
            while r_str != answer[1:-2] and i < len(l):
                # print(r_str)
                if l[i] in answer:
                    r_str += l[i]
                    r[i] = 1
                else:
                    r_str = ''
                    for j in range(i):
                        r[j] = 0
                i += 1
            return r

        def adjust_list(l, words):
            l.insert(0, '"')
            n = len(l)
            cut_list = []
            cut_list.append(l[0])
            for i in range(1, n):
                a = ''
                b = l[i]
                if cut_list[-1] + l[i] in words:
                    cut_list[-1] = cut_list[-1] + l[i]
                    continue
                else:
                    cut_list.append(l[i])
                # while l[i] not in words and len(l[i]) > 1:
                #     a = l[i][-1] + a
                #     l[i] = l[i][:-1]
                # print(b)
                # if l[i] in words:
                #     cut_list.append(l[i])
                # if a != '':
                #     if a in words:
                #         cut_list.append(a)
                # else:
                #     cut_list.append(b)
            # print(cut_list)
            cut_list = cut_list[1:]
            return cut_list

        segmentor = Segmentor()
        segmentor.load('cws.model')
        postagger = Postagger()  # initialize the instance
        postagger.load('pos.model')  # load the model
        recognizer = NamedEntityRecognizer()  # initialize the instance
        recognizer.load('ner.model')  # load the model
        parser = Parser()
        parser.load('parser.model')
        labeller = SementicRoleLabeller()  # initialize the instance
        labeller.load('pisrl.model')  # load the model
        all_types = []
        ftype = open('AC/qa_types1.txt', 'r')
        for line in ftype:
            all_types.append(line[:-1])
        fti = open('AC/tf-idf.txt', 'r')
        all_word = []
        for line in fti:
            k = line[:-1].split('\t')[0]
            all_word.append(k)
        all_word = set(all_word)
        j = 0
        word_feature = []
        word_group = []
        word_all = []
        for k in self.data.keys():
            q_list = []
            q_pos_list = []
            q_sbv = []
            q_vob = []
            q_v = []
            q_att1 = []
            q_att2 = []
            cut_line = '\t'.join(segmentor.segment(self.data[k][0]))
            word_list = cut_line.split('\t')  # word segmentation
            # print(word_list)
            for i in word_list:
                if i not in self.stop_word:
                    q_list.append(i)
            q_list = adjust_list(q_list, all_word)
            postags = postagger.postag(q_list)  # POS tagging
            pos_line = '\t'.join(postags)
            q_pos_list = pos_line.split('\t')
            netags = recognizer.recognize(q_list, postags)  # named entity recognition
            ner_line = '\t'.join(netags)
            ner_list = ner_line.split('\t')
            # print(ner_list)
            q_ner = []
            ner_str = ''
            for nr in range(len(ner_list)):
                if ner_list[nr][0] != 'O':
                    if ner_list[nr][0] == 'S' or ner_list[nr][0] == 'E':
                        ner_str += q_list[nr]
                        q_ner.append(ner_str)
                        ner_str = ''
                    else:
                        ner_str += q_list[nr]
            arcs = parser.parse(q_list, q_pos_list)  # dependency parsing
            arcs_line = "\t".join("%d %s" % (arc.head, arc.relation)
                                  for arc in arcs)
            arcs_list = arcs_line.split('\t')
            roles = labeller.label(q_list, postags, arcs)
            # print(q_list)
            for n in range(len(arcs_list)):
                # print(q_list[int(arcs_list[n].split()[0])-1],q_list[n],arcs_list[n].split()[1])
                if arcs_list[n].split()[1] == 'SBV':
                    q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_sbv.append(q_list[n])
                elif arcs_list[n].split()[1] == 'VOB':
                    q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_vob.append(q_list[n])
                elif arcs_list[n].split()[1] == 'IOB':
                    q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_vob.append(q_list[n])
                elif arcs_list[n].split()[1] == 'FOB':
                    q_vob.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_v.append(q_list[n])
                elif arcs_list[n].split()[1] == 'ATT':
                    q_att1.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_att2.append(q_list[n])
                # print(q_list[int(arcs_list[n].split()[0]) - 1], q_list[n], arcs_list[n].split()[1])

            a_list = []
            a_pos_list = []
            cut_line = '\t'.join(segmentor.segment(self.data[k][1]))
            word_list = cut_line.split('\t')  # word segmentation
            # print(word_list)
            for i in word_list:
                if i not in self.stop_word:
                    a_list.append(i)
            a_list = adjust_list(a_list, all_word)
            postags = postagger.postag(a_list)  # POS tagging
            pos_line = '\t'.join(postags)
            a_pos_list = pos_line.split('\t')
            netags = recognizer.recognize(a_list, postags)  # named entity recognition
            ner_line = '\t'.join(netags)
            ner_list = ner_line.split('\t')
            # print(self.data[k][0],self.data[k][2],self.data[k][-1])
            # print(q_list)
            # print(q_pos_list)
            # print(a_list)
            # print(a_pos_list)
            ner_type = [
                'O', 'S-Nh', 'S-Ni', 'S-Ns', 'B-Nh', 'B-Ni', 'B-Ns', 'I-Nh',
                'I-Ni', 'I-Ns', 'E-Nh', 'E-Ni', 'E-Ns'
            ]
            # for i in range(len(a_list)):
            r_pos = get_answer_pos(a_list, self.data[k][2])
            for i in range(len(a_list)):
                str_f = ''
                # if a_list[i] == self.data[k][2]:
                #     str_f += '1 '
                # else:
                #     str_f += '0 '
                str_f += str(r_pos[i]) + ' '
                if a_list[i] in set(q_list):
                    str_f += '1:1 '
                if a_list[i] in set(q_ner):
                    str_f += '2:1 '
                if a_list[i] in set(q_sbv):
                    str_f += '3:1 '
                if a_list[i] in set(q_v):
                    str_f += '4:1 '
                if a_list[i] in set(q_vob):
                    str_f += '5:1 '
                if a_list[i] in set(q_att1):
                    str_f += '6:1 '
                if a_list[i] in set(q_att2):
                    str_f += '7:1 '
                if a_list[i] in set(self.pos):
                    str_f += '8:1 '
                if i > 1 and a_list[i - 1] in set(self.pos):
                    str_f += '9:1 '
                str_f += get_feature(self.data[k][-1], a_pos_list[i])
                if len(word_feature) != 0:
                    last_word = word_feature[-1].split()
                    # str_f += str(500 + int(last_word[-2][:-2])) + ':1 '
                    if int(last_word[-1][:-2]) < 500:
                        str_f += str(500 + int(last_word[-1][:-2])) + ':1 '
                    else:
                        str_f += str(500 + int(last_word[-2][:-2])) + ':1 '
                else:
                    last_word = ''
                # str_f += str(9 +(ner_type.index(ner_list[i]))) + ':1 '
                # word_feature.append(a_list[i] + ' ' + str_f)
                word_feature.append(str_f)
                # word_all.append(str_f[0] + ' ' + a_list[i])
                word_all.append(a_list[i])
            # print(self.data[k])
            word_group.append(str(len(a_list)))
            j += 1
            if j == 100:
                break
            if j % 1000 == 0:
                print(j)
            #     break
        with open('AC/qa_train.txt', 'w') as f1:
            for wf in word_feature:
                f1.write(wf)
                f1.write('\n')
        with open('AC/qa_group.txt', 'w') as f2:
            for wg in word_group:
                f2.write(str(wg))
                f2.write('\n')
        with open('AC/qa_words.txt', 'w') as f3:
            for wa in word_all:
                f3.write(str(wa))
                f3.write('\n')
Ejemplo n.º 14
0
def recognizer_initial():
    ner_model_path = os.path.join(get_config('ner', 'LTP_DATA_DIR'),
                                  'ner.model')  # path to the NER model `ner.model`
    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(ner_model_path)  # load the model
    return recognizer
Ejemplo n.º 15
0
def person_location_entity(word_list):
    """
    利用ltp获取人物及地点实体
    :param word_list: 分词后的词语列表
    :return:返回实体字典,key为:Nh,Ns,Ni,value为列表
    """
    logging.info('enter person_location_entity...')
    ner_dic = {}
    ner = ''
    if len(word_list) == 0:
        return ner_dic
    MODEL_PATH = r'/home/yanlei/IdeaProjects/hotpot/ltp_model'
    pos_model_path = os.path.join(MODEL_PATH,
                                  'pos.model')  # path to the POS tagging model `pos.model`
    ner_model_path = os.path.join(MODEL_PATH,
                                  'ner.model')  # path to the NER model `ner.model`
    # 1. Initialize the instances
    logging.info('initilizing...')
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    # 2. Load the models and dictionaries
    logging.info('loading...')
    postagger.load(pos_model_path)
    recognizer.load(ner_model_path)
    # 3. POS tagging; note that list.remove() returns None
    logging.info('postaging...')
    if word_list.count('\n') > 0:
        word_list.remove('\n')
    postags = postagger.postag(word_list)
    # 4. Named entity recognition
    logging.info('recognizering...')
    netags = recognizer.recognize(word_list, postags)  # named entity recognition
    # print ('\t'.join(netags))
    # 5. Process the results
    logging.info('result operating...')
    index = 0  # word index
    for tag in netags:
        # If the tag contains no '-', skip it but keep the word index in sync
        if tag.find('-') == -1:
            index += 1
            continue
        # Split on '-': the prefix is the word's position in the entity (B = begin,
        # I = inside, E = end, S = single-word entity) and the suffix is the entity
        # type: person (Nh), place (Ns) or organization (Ni)
        position, type = tag.split('-')
        if position == 'S':
            ner_dic.setdefault(type, [])
            ner_dic[type].append(word_list[index])
        elif position == 'B':
            ner = word_list[index]
        elif position == 'I':
            ner += word_list[index]
        elif position == 'E':
            ner += word_list[index]
            ner_dic.setdefault(type, [])
            ner_dic[type].append(ner)
            ner = ''
        index += 1
    # Filter by rules
    for type, value in ner_dic.items():
        ner_dic[type] = filter_entity(ner_dic.get(type), type)
    # print(ner_dic)
    logging.info('releasing...')
    postagger.release()  # release the model
    recognizer.release()  # release the model
    return ner_dic
Ejemplo n.º 16
0
def getRelation(paragraph):
    """
	paragraph: a list of string, each string is a sentence
	return: a list of relations and a dict which records the number of occurrence of differents DSNF
	"""
    relations = []
    dict_DSNF = {
        'num_DSNF1': 0,
        'num_DSNF2': 0,
        'num_DSNF3': 0,
        'num_DSNF7': 0,
    }

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))

    for iteration, sentence in enumerate(paragraph):

        sentence = SentenceSplitter.split(sentence)[0]

        words = segmentor.segment(sentence)
        # print("\t".join(words))

        postags = postagger.postag(words)
        # the list-of-strings parameter is supported in 0.1.5
        # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
        # print("\t".join(postags))

        arcs = parser.parse(words, postags)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        netags = recognizer.recognize(words, postags)
        # print("\t".join(netags))

        # labeller = SementicRoleLabeller()
        # labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        # roles = labeller.label(words, postags, arcs)
        # for role in roles:
        #     print(role.index, "".join(
        #             ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

        entityList = findEntities(netags)
        # print(entityList)
        entities = []
        for i in entityList:
            l = ''
            for j in i:
                l += words[j]
            entities.append(l)

        DSNF1_ret = DSNF1(arcs, entityList, words, netags)
        DSNF2_ret = DSNF2(arcs, entityList, words)
        DSNF3_ret = DSNF3(arcs, entityList, words, postags)
        DSNF7_ret = DSNF7(arcs, entityList, words)
        # print("DSNF1 result: ", DSNF1_ret)
        # print("DSNF2 result: ", DSNF2_ret)
        # print("DSNF3 result: ", DSNF3_ret)
        # print("DSNF7 result: ", DSNF7_ret)
        relation = []
        for r in DSNF1_ret:
            dict_DSNF['num_DSNF1'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        for r in DSNF2_ret:
            dict_DSNF['num_DSNF2'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        for r in DSNF3_ret:
            dict_DSNF['num_DSNF3'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        for r in DSNF7_ret:
            dict_DSNF['num_DSNF7'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        if len(relation) > 0:
            print("evaluate the " + str(iteration + 1) + "-th sentences")
            print("entities in " + str(iteration + 1) + "-th sentence : ",
                  entities)
            for one in relation:
                r = one[0]
                data = {'sentence': sentence, 'kg': [r[0], r[1], r[2]]}
                # print('r',r)
                key = get_key(data)
                old = DB.kg_mark.find_one({"_id": key})
                if old is None:
                    kg.mark_sentence(key, data)
                else:
                    print("已经存在跳过")
                    continue
                print(one)

                p, softmax = pre(data)
                print("with entities relation: ", r)
                print("预测:", p, "概率:", softmax)
                data['label'] = p
                data['state'] = '4'  #设置状态4独立开来
                print(data)

            # if len(relation)==3:
            # 	print("关系",relation[1],relation[2],relation[0])
            print("--" * 30)

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()

    return relations, dict_DSNF
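
# A minimal usage sketch for getRelation (assumptions: this module also defines
# the DSNF1/DSNF2/DSNF3/DSNF7 helpers, MODELDIR, DB, kg and pre used above, and
# the LTP models are available); the sample sentences are made up.
if __name__ == '__main__':
    paragraph = [
        '中国进出口银行与中国银行加强合作。',
        '国家主席习近平抵达美国佛罗里达州。',
    ]
    relations, dsnf_counts = getRelation(paragraph)
    for triple, sentence in relations:
        print(triple, '<-', sentence)
    print('DSNF pattern counts:', dsnf_counts)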
Ejemplo n.º 17
0
def get_ner(words, postags):
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    netags = recognizer.recognize(words, postags)
    recognizer.release()
    return list(netags)
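
# get_ner above reloads ner.model on every call, which is slow for large inputs.
# A common alternative (a sketch, assuming the same module-level ner_model_path
# variable) is to load the recognizer once and reuse it:
from pyltp import NamedEntityRecognizer

_RECOGNIZER = None


def get_ner_cached(words, postags):
    """Same output as get_ner, but the model is loaded only once."""
    global _RECOGNIZER
    if _RECOGNIZER is None:
        _RECOGNIZER = NamedEntityRecognizer()
        _RECOGNIZER.load(ner_model_path)
    return list(_RECOGNIZER.recognize(words, postags))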
Ejemplo n.º 18
0
def test_ltp():
    from pyltp import Segmentor
    segmentor = Segmentor()
    #segmentor.load('/Users/a000/Downloads/ltp-models/3.3.2/ltp_data.model')
    segmentor.load('/Users/a000/git/ltp_data/cws.model')
    words = segmentor.segment('元芳你怎么看')
    words = segmentor.segment('这本书很好, 我喜欢iphone, 1.5')
    words = segmentor.segment('张子萱怀孕了')
    words = segmentor.segment('我有一本书')
    words = segmentor.segment('今天是2017年3月30日, 清朝的官员')
    words = segmentor.segment('蚂蚁金服近日上市')
    words = segmentor.segment('国家主席习近平抵达美国佛罗里达州')
    words = segmentor.segment('独家|你想要的胸以下全是腿, 科切拉潮人用不')
    total_txt = '<a href=\"http://deeporiginalx.com/search.html#sw=%E7%AC%AC%E4%B8%80%E7%99%BD%E9%93%B6%E7%BD%91\" target=\"_blank\">第一白银网</a>4月19日讯<a href=\"http://deeporiginalx.com/search.html#sw=%E7%8E%B0%E8%B4%A7%E7%99%BD%E9%93%B6\" target=\"_blank\">现货白银</a>今日早盘走势受到美元反弹影响继续走软,目前交投于18.2一线,本周二美国总统特朗普再次提及税改政策,并且宣称将会以“迅雷不及掩耳之势”落地,据小编分析,税改落地将会利好美国经济,从而利好美元,打压白银走势,但问题是,3月份连医改都进展不顺,税改会通过吗?(<a href=\"http://deeporiginalx.com/search.html#sw=%E7%BC%96%E8%BE%91%E6%8E%A8%E8%8D%90%EF%BC%9A%E6%9C%AA%E6%9D%A5%E7%99%BD%E9%93%B6%E8%B5%B0%E5%8A%BF%E5%88%86%E6%9E%90\" target=\"_blank\"><strong><span>编辑推荐:未来白银走势分析</span></strong></a>'
    total_txt = "<span class=\"article_src\">游民星空</span>2017-04-09<span>阅读原文</span>"
    soup = BeautifulSoup(total_txt, 'lxml')
    total_txt = soup.get_text()
    print(total_txt)
    print(type(total_txt))
    words = segmentor.segment(total_txt)
    #words = segmentor.segment(s)
    for i in words:
        print(i)

    import jieba
    w_jieba = jieba.cut('独家|你想要的胸以下全是腿, 科切拉潮人用不')
    print('!!!!!')
    for i in w_jieba:
        print(i)

    from pyltp import Postagger
    poser = Postagger()
    poser.load('/Users/a000/git/ltp_data/pos.model')
    #words_pos = poser.postag(words)
    #for i in xrange(len(words_pos)):
    #    print words[i]
    #    print words_pos[i]

    s1 = '张继科:脚伤恢复七八成 现在不是想退役的时候'
    s2 = '张继科:脚伤恢复八成 现在还不是退役的时候'
    #s2 = '张继科和马龙:脚伤恢复八成 现在还不是退役的时候'
    s3 = '张继科:脚伤已恢复7-8成 现在还不是退役的时候'

    s4 = '国际乒联排名:马龙丁宁占据榜首 张继科第四'
    s5 = '国际乒联公布排名:马龙丁宁第一 张继科第四'

    s6 = '国家主席习近平抵达美国佛罗里达州'
    s7 = '习近平抵达美国佛罗里达州'

    s8 = '习近平抵达美国佛罗里达州 同特朗普会晤'
    s9 = '习近平抵达美国佛罗里达州 将与特朗普举行会晤'
    s10 = '习近平抵达美国 将同特朗普举行会晤'
    s11 = '习近平抵达美国佛罗里达州 将同特朗普举行中美元首会晤'

    s12 = '【V观】习近平引用芬兰谚语:没有人的开拓就不会有路'
    s13 = '习近平引用芬兰谚语:没有人的开拓就不会有路'

    s14 = '习近平就圣彼得堡地铁发生爆炸造成伤亡向普京致慰问电'  #
    s15 = '习近平就圣彼得堡地铁爆炸事件向普京致慰问电'  #15135383
    ss16 = '习近平就圣彼得堡市地铁发生爆炸造成严重人员伤亡向普京致慰问电'  #15130013
    ss17 = '习近平就圣彼得堡市地铁爆炸向普京致慰问电'  #15127277

    s16 = '习近平离京对芬兰进行国事访问并赴美国举行中美元首会晤'  #15131991
    s17 = '习近平离京对芬兰进行国事访问并赴美举行中美元首会晤'  #15132864
    s18 = '习近平离京对芬兰共和国进行国事访问并赴美国佛罗里达州举行中美元首会晤'  #15131971
    ws1 = segmentor.segment(s6)
    ws2 = segmentor.segment(s7)
    print('  '.join(ws1))
    print('  '.join(ws2))
    pos1 = poser.postag(ws1)
    pos2 = poser.postag(ws2)
    print(' '.join(pos1))
    print(' '.join(pos2))

    from pyltp import NamedEntityRecognizer
    reco = NamedEntityRecognizer()
    reco.load('/Users/a000/git/ltp_data/ner.model')
    ne1 = reco.recognize(ws1, pos1)
    ne2 = reco.recognize(ws2, pos2)
    print(' '.join(ne1))
    print(' '.join(ne2))

    from pyltp import Parser
    parser = Parser()
    parser.load('/Users/a000/git/ltp_data/parser.model')
    arc1 = parser.parse(ws1, pos1)
    arc2 = parser.parse(ws2, pos2)
    print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc1)
    print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc2)
Ejemplo n.º 19
0
    def extract_comment(self, article, say_words):
        """
        抽取言论
        :param article: 新闻文本
        :param say_words: words similar to "say"
        :return: result: list[[person, say, comment], ...]
        """
        # ltp路径
        LTP_DATA_PATH = '../ltp_data_v3.4.0'

        cws_model_path = os.path.join(LTP_DATA_PATH, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_PATH, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_PATH, 'ner.model')
        par_model_path = os.path.join(LTP_DATA_PATH, 'parser.model')

        postagger = Postagger()
        postagger.load(pos_model_path)
        print('Postagger loaded!')
        recognizer = NamedEntityRecognizer()
        recognizer.load(ner_model_path)
        print('NER loaded!')
        parser = Parser()
        parser.load(par_model_path)
        print('Parser loaded!')

        result = []
        sentences = self.cut_sentence(self.token(article))
        for s_index, sentence in enumerate(sentences):
            words = self.cut_word(sentence)
            pos = self.word_pos(sentence, postagger)
            ner_list = self.ner(words, pos, recognizer)
            parse_list = self.dependency_parse(words, pos, parser)
            if any(tag in ner_list for tag in ('S-Nh', 'S-Ni', 'S-Ns')):
                comment = ''
                for p_index, p in enumerate(parse_list):
                    # p[0]-1:说的索引(words,parse_list中都是)
                    # p_index:主语位置

                    if (p[1] == 'SBV') and words[p[0] - 1] in say_words:
                        say = words[p[0] - 1]
                        person = words[p_index]
                        p_i = 1
                        while p_i <= p_index and parse_list[p_index -
                                                            p_i][1] == 'ATT':
                            person = words[p_index - p_i] + person
                            p_i = p_i + 1
                        # 说后是。找前一句话的“”
                        if words[p[0]] == '。':
                            # print('说。')
                            i = 1
                            last_sentence = sentences[s_index - i]
                            last_words = self.cut_word(last_sentence)
                            begin = self.find_str_index(last_words, 0, ['“'])
                            end = self.find_str_index(last_words, 0, ['”'])
                            if begin != -1 and end != -1 and begin < end:
                                comment = ''.join(last_words[begin + 1:end])
                            else:
                                while begin == -1 and end != -1:
                                    i = i + 1
                                    last_sentence = sentences[s_index - i]
                                    last_words = self.cut_word(last_sentence)
                                    begin = self.find_str_index(
                                        last_words, 0, ['“'])
                                while i > 0:
                                    comment = comment + sentences[s_index - i]
                                    i = i - 1
                        else:
                            begin = self.find_str_index(words, p[0], ['“'])
                            end = self.find_str_index(words, p[0], ['”'])
                            if begin != -1 and end != -1 and parse_list[
                                    end - 1][1] == 'WP':
                                comment = ''.join(words[begin:end])
                            elif begin != -1 and end == -1:
                                comment = ''.join(words[begin:])
                                i = 1
                                next_sentence = sentences[s_index + i]
                                while end == -1:
                                    end = self.find_str_index(
                                        self.cut_word(next_sentence), 0, ['”'])
                                    i = i + 1
                                    if len(sentences) > s_index + i:
                                        next_sentence = sentences[s_index + i]
                                    else:
                                        break
                                comments = ''
                                while i > 1 and len(sentences) > s_index + i:
                                    comments = sentences[s_index +
                                                         i] + comments
                                    i = i - 1
                                comment = comment + comments

                            else:
                                # 说后面跟,或:
                                if words[p[0]] == ',' or words[
                                        p[0]] == ',' or words[p[0]] == ':':
                                    # print('说,')
                                    comment = ''.join(words[p[0] + 1:])
                                    # end = self.find_str_index(words, p[0] + 1, ['。', '!'])
                                    # if end != -1:
                                    #     comment = ''.join(words[p[0] + 1:end])
                                    # 说后跟宾语
                                elif parse_list[
                                        p[0]][1] == 'VOB' or parse_list[
                                            p[0]][1] == 'IOB':
                                    print('告诉谁')
                                    i = 0
                                    comment = ''.join(words[p[0] + 1:])
                                    # while len(comment) == 0:
                                    #     end = self.find_str_index(words, p[0] + i, [ '。', '!'])
                                    #     if end != -1:
                                    #         comment = ''.join(words[p[0] + i:end])
                                    #     i = i + 1
                                    # 说后面直接跟内容
                                else:
                                    comment = ''.join(words[p[0]:])
                                    # print('说内容')
                                    # end = self.find_str_index(words, p_index, [ '。', '!'])
                                    # if end != -1:
                                    #     comment = ''.join(words[p[0]:end])

                        print(parse_list)
                        # print(words[p[0]])
                        print(sentence)
                        print('[{}] [{}] [{}]'.format(person, say, comment))
                        print('-' * 50)
                        item = []
                        # item.append(person)
                        # item.append(say)
                        # item.append(comment)
                        result.append([person, say, comment])
                        # result.append(item)

        postagger.release()
        recognizer.release()
        parser.release()

        return result
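
# A minimal usage sketch for extract_comment. Assumptions: the surrounding
# class is instantiated as OpinionExtractor (a hypothetical name used only
# here), its cut_sentence/token/cut_word/word_pos/ner/dependency_parse helpers
# exist, and ../ltp_data_v3.4.0 holds the LTP models; the article and the
# say-word list are made up.
if __name__ == '__main__':
    extractor = OpinionExtractor()  # hypothetical class name
    article = '新华社北京电,有关负责人表示,今年将继续扩大开放。'
    say_words = ['说', '表示', '认为', '指出', '称']
    for person, say, comment in extractor.extract_comment(article, say_words):
        print(person, say, comment)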
Ejemplo n.º 20
0
class Ltp(LtpSegment):
    __model_dir = os.path.join('source', 'ltp_data_v3.4.0')

    # 词性标注
    postagger = Postagger()
    postagger.load(os.path.join(__model_dir, "pos.model"))

    # 命名实体识别
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(__model_dir, "ner.model"))

    # 依存句法分析
    parser = Parser()
    parser.load(os.path.join(__model_dir, "parser.model"))

    # 语义角色标注
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(__model_dir, "pisrl.model"))

    def __init__(self):
        super().__init__()

    def postag(self, words):
        """
        词性标注
        :param input: 分词结果 list
        :return: 词性 list
        """
        postags = self.postagger.postag(words)
        return list(postags)

    def recognize(self, words, postags):
        """
        命名实体识别:
        1. LTP 采用 BIESO 标注体系:B表示实体开始词;I表示实体中间词;E表示实体结束词;
           S表示单独成实体;O表示不构成命名实体
        2. LTP 提供的命名实体类型为:人名(Nh);地名(Ns);机构名(Ni)
        3. B、I、E、S位置标签和实体类型标签之间用一个横线 - 相连;O标签后没有类型标签
        例如:
            S-Nh 表示单独一个词构成了人名。
        :param words: 分词结果 list
        :param postags: 词性标注结果 list
        :return: 命名实体标注结果 list
        """
        netags = self.recognizer.recognize(words, postags)
        return list(netags)

    def parse(self, words, postags):
        """
        依存句法分析
        :param words: 分词结果 list
        :param postags: 词性标注结果 list
        :return: ltp原生结果
            (arc.head, arc.relation) for arc in arcs
            ROOT节点的索引是0,第一个词开始的索引依次为1、2、3
            arc.head 表示依存弧的父节点词的索引,arc.relation 表示依存弧的关系。
        例:
        inputs:
            words = ['元芳', '你', '怎么', '看']
            postags = ['nh', 'r', 'r', 'v']
        output:
            4:SBV 4:SBV 4:ADV 0:HED
            输出格式为 head:relation
        """
        arcs = self.parser.parse(words, postags)
        return arcs

    def label(self, words, postags, arcs):
        """
        语义角色标注
        :param words: 分词结果 list
        :param postags: 词性标注结果 list
        :param arcs: 依存句法分析结果 ltp
        :return: ltp原生结果
            (arg.name, arg.range.start, arg.range.end) for arg in role.arguments
            第一个词开始的索引依次为0、1、2
            返回结果 roles 是关于多个谓词的语义角色分析的结果。由于一句话中可能不含有语义角色,所以
            结果可能为空。role.index 代表谓词的索引, role.arguments 代表关于该谓词的若干语义角
            色。arg.name 表示语义角色类型,arg.range.start 表示该语义角色起始词位置的索引,
            arg.range.end 表示该语义角色结束词位置的索引。
        例:
        inputs:
            words = ['元芳', '你', '怎么', '看']
            postags = ['nh', 'r', 'r', 'v']
            arcs 使用依存句法分析的结果
        output:
            3 A0:(0,0)A0:(1,1)ADV:(2,2)

            由于结果输出一行,所以“元芳你怎么看”有一组语义角色。
            其谓词索引为3,即“看”。
            这个谓词有三个语义角色范围分别是:
                (0,0)即“元芳”,(1,1)即“你”,(2,2)即“怎么”,类型分别是A0、A0、ADV。
        """
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def get_name_entity(self, sentence, entity_type):
        """
        获取句子中特定的命名实体集
        :param sentence: 待分析句子
        :param entity_type: 待分析命名实体类型,可选值:Nh(人名)、Ns(地名)、Ni(机构名)
        :return:
        """
        words = self.segment(sentence)
        postags = self.postag(words)
        ne_tags = self.recognize(words, postags)
        sentence_len = len(words)

        ret_entity = set()
        entity_pattern = ""
        for i in range(sentence_len):
            if (ne_tags[i] == 'B-' + entity_type) or (ne_tags[i]
                                                      == 'I-' + entity_type):
                entity_pattern += words[i]
            elif (ne_tags[i] == 'E-' + entity_type) or (ne_tags[i]
                                                        == 'S-' + entity_type):
                entity_pattern += words[i]
                ret_entity.add(entity_pattern)
                entity_pattern = ""

        return list(ret_entity)
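
# A minimal usage sketch for the Ltp wrapper above (assumptions: LtpSegment
# provides segment(), and source/ltp_data_v3.4.0 contains the models); the
# sample sentence is made up.
if __name__ == '__main__':
    ltp = Ltp()
    sentence = '国家主席习近平抵达美国佛罗里达州'
    words = ltp.segment(sentence)
    postags = ltp.postag(words)
    print(list(zip(words, postags)))
    print(ltp.recognize(words, postags))        # BIESO entity tags
    print(ltp.get_name_entity(sentence, 'Ns'))  # place names (Ns)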
Ejemplo n.º 21
0
def get_recognizer(LTP_DATA_DIR):
    ner_model_path = os.path.join(LTP_DATA_DIR,
                                  'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`
    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型
    return recognizer
Ejemplo n.º 22
0
def simlify(text):
    LTP_DATA_DIR = r'E:\anaconda\ltpmoxin\ltp_data'  # ltp模型目录的路径
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  'cws.model')  # 分词模型路径,模型名称为`cws.model`

    lexicon_path = os.path.join(LTP_DATA_DIR, 'lexicon')  # 分词词典lexicon

    segmentor = Segmentor()  # 初始化实例

    # segmentor.load(cws_model_path)  # 加载模型,如果不想自定义词典,就用这一句load模型即可

    segmentor.load_with_lexicon(cws_model_path,
                                lexicon_path)  # 加载模型,参数lexicon是自定义词典的文件路径

    words = segmentor.segment(text)  # 分词

    #print('|'.join(words))#打印分词结果

    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # 词性标注模型路径,模型名称为`pos.model`

    postagger = Postagger()  # 初始化实例

    postagger.load(pos_model_path)  # 加载模型

    postags = postagger.postag(words)  # 词性标注,这里words是分词后的list

    #print(' | '.join(postags))

    postagger.release()  # 释放模型

    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`

    parser = Parser()  # 初始化实例

    parser.load(par_model_path)  # 加载模型

    arcs = parser.parse(words, postags)  # 句法分析
    parser.release()  # 释放模型
    #信息提取,结果展示

    rely_id = [arc.head for arc in arcs]  # 提取依存父节点id

    relation = [arc.relation for arc in arcs]  # 提取依存关系

    heads = ['Root' if id == 0 else words[id - 1]
             for id in rely_id]  # 匹配依存父节点词语

    #for i in range(len(words)):

    #print(relation[i] +'(' + words[i] +', ' + heads[i] +')')

    array = []
    for i in range(len(words)):
        dict = {}
        dict["dep"] = words[i]
        dict["gov"] = heads[i]
        dict["pos"] = relation[i]
        array.append(dict)
    return array

    # 注意:以下命名实体识别代码位于 return 之后,永远不会执行(疑为遗留的调试代码)
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型
    netags = recognizer.recognize(words, postags)  # 命名实体识别
    #for word, ntag in zip(words, netags):
    #   print(word + '/' + ntag)
    recognizer.release()  # 释放模型
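
# A minimal usage sketch for simlify (assumptions: the LTP models and the
# custom lexicon exist at the hard-coded Windows paths above); the input
# sentence is made up.
if __name__ == '__main__':
    for triple in simlify('国家主席习近平抵达美国佛罗里达州'):
        # each item is {'dep': word, 'gov': head word or 'Root', 'pos': relation}
        print('{pos}({dep}, {gov})'.format(**triple))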
Ejemplo n.º 23
0
    # --------------------- 词性标注 ------------------------
    postagger = Postagger(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)
    # list-of-string parameter is supported in 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    print("\t".join(postags))

    # --------------------- 依存句法分析 ------------------------
    parser = Parser(os.path.join(MODELDIR, "parser.model"))
    arcs = parser.parse(words, postags)

    print("\t".join("%d:%s" % (head, relation) for (head, relation) in arcs))

    # --------------------- 命名实体识别 ------------------------
    recognizer = NamedEntityRecognizer(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))

    # --------------------- 语义角色标注 ------------------------
    labeller = SementicRoleLabeller(os.path.join(MODELDIR, "pisrl_win.model"))
    roles = labeller.label(words, postags, arcs)

    for index, arguments in roles:
        print(
            index, " ".join([
                "%s: (%d,%d)" % (name, start, end)
                for (name, (start, end)) in arguments
            ]))

    segmentor.release()
Ejemplo n.º 24
0
def cal_sentiment_NER(df_text):
    """
    natural language processing on every row from the input.
    1. for loop dataframe:
    2. preprocess text in the df.
    3. get entity using pyLTP
    4. get sentiment, keywords, summary using SnowNLP.
    5. append result to df
    Keyword Arguments:
    df_text --
    """
    # 词性标注
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
    postagger = Postagger()  # 初始化实例
    postagger.load(pos_model_path)  # 加载模型

    # 命名实体识别
    ner_model_path = os.path.join(LTP_DATA_DIR,
                                  'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`

    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.DEBUG)

    if isinstance(df_text, gftIO.GftTable):
        df_text = df_text.as_mutable_column_tab()
    df_result = pd.DataFrame(columns=[
        'datetime', 'people', 'geography', 'organization', 'keyword',
        'summary', 'score'
    ])
    for item in df_text[:10].iterrows():
        #  print(item[1]['Conclusion'])
        logging.info(item[0])

        text = item[1]['Conclusion']
        datetime = item[1]['WritingDate']
        if not pd.isnull(text):
            text_split = preprocessing.preprocess_string(text)
            # 词性标注
            #            postagger = Postagger()  # 初始化实例

            words = text_split.split()  # 分词结果
            postags = postagger.postag(words)  # 词性标注
            netags = recognizer.recognize(words, postags)  # 命名实体识别

            dict_netags = defaultdict(list)
            ls_netags = list(zip(netags, words))
            for x, y in ls_netags:
                dict_netags[x].append(y)

            s = SnowNLP(text)
            score = s.sentiments * 2
            # # 人名(Nh)、地名(Ns)、机构名(Ni)
            # # B、I、E、S
            ls_organization = [
                dict_netags[x] for x in ['S-Ni', 'B-Ni', 'E-Ni', 'I-Ni']
            ]
            ls_people = [
                dict_netags[x] for x in ['S-Nh', 'B-Nh', 'E-Nh', 'I-Nh']
            ]
            ls_geography = [
                dict_netags[x] for x in ['S-Ns', 'B-Ns', 'E-Ns', 'I-Ns']
            ]
            try:
                df_result = df_result.append(
                    {
                        'datetime':
                        datetime,
                        'keyword':
                        ','.join(s.keywords()),
                        'organization':
                        list(itertools.chain.from_iterable(ls_organization)),
                        'people':
                        list(itertools.chain.from_iterable(ls_people)),
                        'geography':
                        list(itertools.chain.from_iterable(ls_geography)),
                        'summary':
                        ';'.join(s.summary()),
                        'score':
                        score
                        # 'text': text,
                    },
                    ignore_index=True)
            except Exception:
                continue
    return df_result
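
# A minimal usage sketch for cal_sentiment_NER (assumptions: LTP_DATA_DIR
# points at the model directory, the preprocessing/SnowNLP imports used above
# are available, and a plain DataFrame is passed instead of a gftIO.GftTable);
# the report text and date are made up.
if __name__ == '__main__':
    import pandas as pd
    df_demo = pd.DataFrame({
        'Conclusion': ['公司业绩超预期,维持买入评级。'],
        'WritingDate': ['2017-04-19'],
    })
    print(cal_sentiment_NER(df_demo))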
Ejemplo n.º 25
0
import json

config = None
with open("config.json", 'r') as load_f:
    config = json.load(load_f)

# print(config)
cws_model_path = config['model'] + '/cws.model'
pos_model_path = config['model'] + '/pos.model'
par_model_path = config['model'] + '/parser.model'
ner_model_path = config['model'] + '/ner.model'

#初始化
segmentor = Segmentor()  #分词
postagger = Postagger()  #词性标注
recognizer = NamedEntityRecognizer()  #命名主体识别
parser = Parser()  #依存分析

segmentor.load(cws_model_path)
print('Segmentor Model Loaded')
postagger.load(pos_model_path)
print('Postagger Model Loaded')
recognizer.load(ner_model_path)
print('Recognizer Model Loaded')
parser.load(par_model_path)
print('Parser Model Loaded')

say_words = [
    ':', '诊断', '交代', '说', '说道', '指出', '报道', '报道说', '称', '警告', '所说', '告诉', '声称',
    '表示', '时说', '地说', '却说', '问道', '写道', '答道', '感叹', '谈到', '说出', '认为', '提到',
    '强调', '宣称', '表明', '明确指出', '所言', '所述', '所称', '所指', '常说', '断言', '名言', '告知',
Ejemplo n.º 26
0
class LtpFormatter:
    model_dir = os.path.join("utils", "ltp_data_v3.4.0")
    # 注意这里的位置需要调整为运行位置到ltp的相对位置,或者设置为绝对位置

    segmentor = Segmentor()
    segmentor.load(os.path.join(model_dir, "cws.model"))

    postagger = Postagger()
    postagger.load(os.path.join(model_dir, "pos.model"))

    parser = Parser()
    parser.load(os.path.join(model_dir, "parser.model"))

    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(model_dir, "ner.model"))

    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(model_dir, "pisrl.model"))

    def format_only_pos(self, sentence):
        results = {'basic': [], 'role': []}

        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)

        index = 0
        for word, postag in zip(words, postags):
            results['basic'].append({
                'index': index,
                'word': word,
                'pos': postag
            })
            index += 1

        return results

    def format(self, sentence):

        results = {'basic': [], 'role': []}

        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        netags = self.recognizer.recognize(words, postags)
        roles = self.labeller.label(words, postags, arcs)

        index = 0
        for word, postag, arc, netag in zip(words, postags, arcs, netags):
            results['basic'].append({
                'index': index,
                'word': word,
                'pos': postag,
                'entity': netag,
                'head': arc.head - 1,
                'relation': arc.relation
            })
            index += 1

        for role in roles:
            relations = []

            for arg in role.arguments:
                relations.append({
                    'name': arg.name,
                    'start': arg.range.start,
                    'end': arg.range.end
                })

            results['role'].append({
                'trigger': words[role.index],
                'index': role.index,
                'relation': relations
            })

        return results

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
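
# A minimal usage sketch for LtpFormatter (assumptions: the LTP models live
# under utils/ltp_data_v3.4.0 as in the class above); the sentence is made up.
if __name__ == '__main__':
    formatter = LtpFormatter()
    result = formatter.format('国家主席习近平抵达美国佛罗里达州')
    for token in result['basic']:
        print(token['index'], token['word'], token['pos'],
              token['entity'], token['head'], token['relation'])
    for role in result['role']:
        print('predicate:', role['trigger'], role['relation'])
    formatter.release()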
Ejemplo n.º 27
0
#segmentor.load(cws_model_path)  # 加载模型
segmentor.load_with_lexicon(cws_model_path,
                            r'D:\python\ltp_data_v3.4.0\lexicon')  # 加载模型,第二个参数是外部词典文件路径
segmentor_2 = Segmentor()  # 初始化实例
# segmentor.load(cws_model_path)  # 加载模型
segmentor_2.load_with_lexicon(
    cws_model_path, r'D:\python\ltp_data_v3.4.0\lexicon_label')  # 加载模型

postagger = Postagger()  # 初始化实例
postagger_2 = Postagger()  # 初始化实例
postagger.load_with_lexicon(pos_model_path,
                            r'D:\python\ltp_data_v3.4.0\lexicon_1')  # 加载模型
postagger_2.load_with_lexicon(
    pos_model_path, r'D:\python\ltp_data_v3.4.0\lexicon_label_1')  # 加载模型
recognizer = NamedEntityRecognizer()  # 初始化实例
recognizer_2 = NamedEntityRecognizer()  # 初始化实例
recognizer.load(ner_model_path)  # 加载模型
recognizer_2.load(ner_model_path)  # 加载模型
parser = Parser()  # 初始化实例
parser.load(par_model_path)  # 加载模型
labeller = SementicRoleLabeller()  # 初始化实例
labeller.load(srl_model_path)  # 加载模型


def is_name_entity(entity):
    return entity != 'O'


def show_detail(sent):
    words = segmentor.segment(sent)  # 分词
Ejemplo n.º 28
0
    def __init__(self):
        self.LTP_DATA_DIR = '/Users/yf/Downloads/ltp_data_v3.4.0'
        # 自定义分词表
        self.cut_file = '/Users/yf/Downloads/ltp_data_v3.4.0/cut.txt'
        # 分词结果
        self.cut_list = []
        # 依存关系
        self.arcs = None
        # 词性
        self.part_speech_list = []
        # 分词
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(self.LTP_DATA_DIR, 'cws.model'), self.cut_file)
        # 词性标注
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.LTP_DATA_DIR, 'pos.model'))
        # 命名实体识别
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.LTP_DATA_DIR, 'ner.model'))
        # 依存句法分析
        self.parser = Parser()
        self.parser.load(os.path.join(self.LTP_DATA_DIR, 'parser.model'))
        # 语义角色标注
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(self.LTP_DATA_DIR, 'pisrl.model'))

        # 词性标注集
        self._dict = {
            "a": "形容词",
            "ni": "机构名称",
            "b": "其他名词修饰语",
            "nl": "位置名词",
            "c": "连词",
            "ns": "地名",
            "d": "副词",
            "nt": "时态名词",
            "e": "感叹",
            "nz": "其他专有名词",
            "g": "词素",
            "o": "拟声词",
            "h": "字首",
            "p": "介词",
            "i": "成语",
            "q": "数量",
            "j": "缩写",
            "r": "代词",
            "k": "后缀",
            "u": "辅助的",
            "m": "数",
            "v": "动词",
            "n": "一般名词",
            "wp": "标点",
            "nd": "方向名词",
            "ws": "外来词",
            "nh": "人名",
            "x": "最小意义单位"
        }
        # 依存句法关系
        self._dict2 = {
            "SBV": "主谓关系",
            "VOB": "动宾关系",
            "IOB": "间宾关系",
            "FOB": "前置宾语",
            "DBL": "兼语",
            "ATT": "定中关系",
            "ADV": "状中结构",
            "CMP": "动补结构",
            "COO": "并列关系",
            "POB": "介宾关系",
            "LAD": "左附加关系",
            "RAD": "右附加关系",
            "IS": "独立结构",
            "HED": "核心关系"
        }
        # 命名实体识别标注集
        self._dict3 = {
            "O": "这个词不是NE",
            "S": "这个词单独构成一个NE",
            "B": "这个词为一个NE的开始",
            "I": "这个词为一个NE的中间",
            "E": "这个词位一个NE的结尾"
        }
        self._dict4 = {"Nh": "人名", "Ni": "机构名", "Ns": "地名"}
        # 语义角色类型
        self._dict5 = {
            "ADV": "默认标记",
            "BNE": "受益人",
            "CND": "条件",
            "DIR": "方向",
            "DGR": "程度",
            "EXT": "扩展",
            "FRQ": "频率",
            "LOC": "地点",
            "MNR": "方式",
            "PRP": "目的或原因",
            "TMP": "时间",
            "TPC": "主题",
            "CRD": "并列参数",
            "PRD": "谓语动词",
            "PSR": "持有者",
            "PSE": "被持有"
        }
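
    # A small illustrative helper (not part of the original class, a sketch
    # only): it uses the lookup tables above to turn raw LTP tags into the
    # human-readable labels they document.
    def describe_token(self, postag, relation):
        """Map a POS tag and a dependency relation to their Chinese names."""
        return (self._dict.get(postag, postag),
                self._dict2.get(relation, relation))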
Ejemplo n.º 29
0
            elif opt in ('-p', '--pfile'):
                options['pfile'] = arg
            elif opt in ('-m', '--master'):
                options['master'] = arg
            elif opt in ('-t', '--target'):
                options['target'] = arg
            elif opt in ('-c', '--capitalize'):
                options['capitalize'] = True
            elif opt == '-h':
                display_usage()
                sys.exit()
        slines = codecs.open(options['sfile'], 'r', 'utf-8').readlines()
        plines = codecs.open(options['pfile'], 'r', 'utf-8').readlines()

        postagger = Postagger()
        recognizer = NamedEntityRecognizer()

        if len(slines) != len(plines):
            print('sfile lines must equal pfile lines!!!')
            sys.exit()

        total_lines = len(slines)

        cpunum = min(total_lines, multiprocessing.cpu_count())
        cpunum = 5
        blines = int(math.ceil(total_lines / cpunum))
        i = 0
        while i < cpunum:
            sindex = int(i * blines)
            eindex = min((i + 1) * blines, total_lines)
            pw = Process(
Ejemplo n.º 30
0
Archivo: L2R.py Proyecto: Goerwa/QA
    def get_vec(self, filename):
        def list2str(l):
            r_str = ' '
            for i in l:
                r_str += str(int(i.split(':')[0]) + 10)  +':'+ i.split(':')[1]+ ' '
            return r_str
        def adjust_list(l,words):
            l.insert(0,'"')
            n = len(l)
            cut_list = []
            cut_list.append(l[0])
            for i in range(1,n):
                a = ''
                if cut_list[-1]+l[i] in words:
                    cut_list[-1] = cut_list[-1]+l[i]
                    continue
                while l[i] not in words and len(l[i]) > 1:
                    a = l[i][-1] + a
                    l[i] = l[i][:-1]
                if l[i] in words:
                    cut_list.append(l[i])
                if a != '':
                    if a in words:
                        cut_list.append(a)
            # print(cut_list)
            return cut_list
        def get_tf_idf(data,syn_dict):
            all_words = []
            dict_freq = {}  # 词频
            dict_doc = {}  # 文档数量
            all_num = 0
            words = []
            stop_words = self.stop_word
            tf_idf = {}
            all_sent = []
            for k in data.keys():
                # print(data[k])
                line_list = (data[k].replace('[','').replace(']','').split('", '))
                line_list[-1] = line_list[-1][:-1]
                for i in line_list:
                    line_i = i.split('\t')[1] + '"'
                    all_sent.append(line_i)
                    cut_line = '\t'.join(segmentor.segment(line_i))
                    words_list = cut_line.split('\t')  # 分词
                    is_adddoc = []
                    for word in words_list:
                        if word not in stop_words:
                            if word not in dict_freq.keys():
                                dict_freq[word] = 1
                            else:
                                dict_freq[word] += 1
                            if word not in is_adddoc:  # 统计词出现过的问题数
                                if word not in dict_doc.keys():
                                    dict_doc[word] = 1
                                else:
                                    dict_doc[word] += 1
                                is_adddoc.append(word)
            for k in dict_freq.keys():
                idf = math.log(self.N / dict_doc[k])
                tf_idf[k] = 1 + math.log(dict_freq[k])
                tf_idf[k] *= idf
            with open('AC/tf-idf.txt','w') as fr:
                for k in tf_idf.keys():
                    fr.write(k)
                    fr.write('\t')
                    fr.write(str(tf_idf[k]))
                    fr.write('\n')

        def get_feature_vec(q_list,a_list):
            feature  = []
            q_den = 1
            for word in q_list:
                q_den += tf_idf[word]**2
            for sa_list in a_list:
                vec_f = 0
                a_den = 1
                for wa in sa_list:
                    a_den += tf_idf[wa]**2
                    if wa in set(q_list):
                        vec_f += tf_idf[wa]
                den = (q_den * a_den)**0.5
                vec_f /= den
                feature.append(round(vec_f*1000,2))
            return feature

        def get_feature_bm25(q_list,a_list,all_words):
            all_wordsl = list(all_words)
            # print(all_wordsl[174017])
            feature = []
            s = BM25(a_list,all_wordsl)
            # s.simall(q_list)
            # print(s.simall(q_list))
            for i in s.simall(q_list):
                feature.append(i)
            # print(feature)
            return feature

        # def get_feature_sim(q_list,a_list):
        #     feature = []
        #     str_q = ''
        #     for q in q_list:
        #         str_q  = str_q + ' ' + q
        #     for as_list in a_list:
        #         str_sa = ''
        #         for a in as_list:
        #             str_sa = str_sa + ' ' + a
        #         # print(q_list,as_list,synonyms.compare(q_list,as_list))
        #         if len(str_sa) < 1 or len(str_q) < 1:
        #             feature.append(0.0)
        #         else:
        #             feature.append(round(synonyms.compare(str_q, str_sa,seg=False)*1000,3))
        #     return feature

        def get_feature_same(q,a_list):
            r = []
            for sa_list in a_list:
                n = 0
                for a in sa_list:
                    if a in q:
                        n += 1
                r.append(n)
            return r
        def get_DA(words_list):
            postags = postagger.postag(words_list)  # 词性标注
            pos_line = '\t'.join(postags)
            pos_list = pos_line.split('\t')
            # print(pos_list)
            # print(pos_list)
            if pos_list == ['']:
                return []
            netags = recognizer.recognize(words_list, pos_list)  # 命名实体识别
            ner_line = '\t'.join(netags)
            ner_list = ner_line.split('\t')

            arcs = parser.parse(words_list, pos_list)  # 句法分析
            arcs_line = "\t".join("%d %s" % (arc.head, arc.relation) for arc in arcs)
            arcs_list = arcs_line.split('\t')
            r = []
            rsyn = []
            for i in range(len(arcs_list)):
                # arcs_list[i] 形如 "3 ATT";取空格前的 head 序号(直接取字符 [0] 在 head >= 10 时会被截断)
                head = int(arcs_list[i].split()[0])
                if pos_list[i][0] in {'n', 'v', 'a'}:
                    r.append(words_list[head - 1] + '_' + words_list[i])
            return r
        def get_feature_DA(q_list,a_list):
            feature = []
            feature_q = get_DA(q_list)
            feature_a = []
            for sa_list in a_list:
                feature_sa = get_DA(sa_list)
                # print(feature_q)
                # print(feature_sa)
                score = 0.0
                n = 0
                for sa in feature_sa:
                    for q in feature_q:
                        n += 1
                        # print(sa,q)
                        if sa == q:
                            score += 1
                        elif sa.split('_')[0] == q.split('_')[0]:
                            score += 0.5
                        elif sa.split('_')[1] == q.split('_')[1]:
                            score += 0.5
                        else:
                            n -= 1
                # print(score,n)
                if score > 0.4:
                    feature.append(score/n)
                else:
                    feature.append(0.0)
            # print(feature)
            return feature

        fd = open(filename, 'r')
        data = []
        data_dict = {}
        for line in fd:
            # print(line[:-1])
            # print(line[:-1].split('\t')[1])
            data.append(line[:-1])
        for i in range(0,len(data),2):
            data_dict[data[i]] = data[i+1]

        segmentor = Segmentor()
        segmentor.load('cws.model')
        postagger = Postagger()  # 初始化实例
        postagger.load('pos.model')  # 加载模型
        recognizer = NamedEntityRecognizer()  # 初始化实例
        recognizer.load('ner.model')  # 加载模型
        parser = Parser()
        parser.load('parser.model')
        tf_idf = {}
        all_word = []
        answer_vec = []
        fti = open('AC/tf-idf.txt', 'r')
        for line in fti:
            k = line[:-1].split('\t')[0]
            v = line[:-1].split('\t')[1]
            tf_idf[k] = round(float(v), 2)
            all_word.append(k)
        all_word = set(all_word)
        j = 0
        for k in data_dict.keys():
            # print(k,data_dict[k])
            # print(j)
            pos_a = []
            cut_line = '\t'.join(segmentor.segment(k.split('\t')[1][1:-1]))
            words_list = cut_line.split('\t')  # 分词
            words_list = adjust_list(words_list, all_word)
            q_list = []
            a_list = []
            for word in words_list:
                if word not in self.stop_word:
                    q_list.append(word)
            line_list = (data_dict[k].replace('[', '').replace(']', '').split('", '))
            line_list[-1] = line_list[-1][:-1]
            for i in line_list:
                sa_list = []
                line_i = i.split('\t')[1] + '"'
                i_n = i.split('\t')[0]
                pos_a.append(i_n)
                cut_line = '\t'.join(segmentor.segment(line_i[1:-1]))
                words_list = cut_line.split('\t')  # 分词
                words_list = adjust_list(words_list, all_word)
                for word in words_list:
                    if word not in self.stop_word:
                        sa_list.append(word)
                a_list.append(sa_list)
            # print(q_list)
            # print(a_list)
            feature_same = get_feature_same(k.split('\t')[1][1:-1],a_list)
            feature_vec = get_feature_vec(q_list,a_list)
            feature_bm25 = get_feature_bm25(q_list,a_list,all_word)
            # feature_sim = get_feature_sim(q_list, a_list)
            feature_DA = get_feature_DA(q_list,a_list)

            for ni in range(len(pos_a)):
                answer_vec.append(pos_a[ni] + ' 1:' + str(feature_vec[ni]) + ' 2:' + str(feature_DA[ni]) + ' 3:' +
                                  str(feature_same[ni]) + list2str(feature_bm25[ni]))
            j += 1
            if j % 500 == 0:
                print(j)
            # if j == 5:
            #     break
        with open('AC/train.txt', 'w') as fw:
            for avec in answer_vec:
                fw.write(avec)
                fw.write('\n')
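
# A standalone sketch (not from the original project) of the weighting used in
# get_tf_idf above: tf-idf(w) = (1 + log(tf(w))) * log(N / df(w)), shown here
# with made-up counts.
import math


def tf_idf_weight(term_freq, doc_freq, total_docs):
    """Log-scaled term frequency multiplied by inverse document frequency."""
    return (1 + math.log(term_freq)) * math.log(total_docs / doc_freq)


if __name__ == '__main__':
    # a word seen 5 times overall and appearing in 3 of 1000 questions
    print(round(tf_idf_weight(5, 3, 1000), 2))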