def genfast(rows, outfile):
    """Write rows out as fastText training lines: __label__<catalog> <processed text>."""
    fou = open(outfile, 'w', encoding='UTF-8')
    # for file in f:
    #     print(file)
    #     domboj = xmldom.parse(file)
    #     rows = domboj.getElementsByTagName("row")
    TextProcess()
    for row in rows:
        title = row[3]
        content = row[4]
        catalog = row[9]
        if catalog is None:
            catalog = 'other'
        elif catalog.find('扶贫') > -1:    # poverty-alleviation news
            catalog = 'fupin'
        elif catalog.find('环保') > -1:    # environmental-protection news
            catalog = 'huanbao'
        # catalog = 'other' if row[9] is None else 'huanbao'
        # print(title, catalog)
        line = title.strip() + " " + content.strip()
        outline = TextProcess.doAll(line)
        outline = "\t__label__" + catalog + " " + outline + "\t\n"
        fou.write(outline)
    fou.close()
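# A minimal usage sketch, assuming getTrainData (the query helper used elsewhere in this
# module) returns rows with the column layout genfast expects (3=title, 4=content, 9=catalog).
# Illustrative only, not a fixed entry point of the project.
def demo_genfast():
    rows = getTrainData("SELECT *" + " FROM news_" + " WHERE detail like '%s'", '%习近平%')
    genfast(rows, './resources/news_fasttext_all.txt')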
def NB():
    """Compare MultinomialNB and an RBF-kernel SVM on bag-of-words features.
    Rows mentioning 习近平 are used for training, all remaining rows for testing."""
    data = getTrainData("SELECT *" + " FROM news_" + " WHERE detail like '%s'", '%习近平%')
    TextProcess()
    X_train = [TextProcess.doAll(r[3] + r[4]) for r in data]
    y_train = [1 if r[9] == '环保' else 0 for r in data]
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    # A TfidfVectorizer variant of this pipeline is sketched in demo_nb_tfidf below.
    X_count_train = vectorizer.fit_transform(X_train)
    mnb_count = MultinomialNB()
    svm_ = SVC(kernel='rbf')
    mnb_count.fit(X_count_train, y_train)
    svm_.fit(X_count_train, y_train)
    data1 = getTrainData("SELECT *" + " FROM news_" + " WHERE detail not like '%s'", '%习近平%')
    X_test = [TextProcess.doAll(r[3] + r[4]) for r in data1]
    X_count_test = vectorizer.transform(X_test)
    y_test = [1 if r[9] == '环保' else 0 for r in data1]
    y_predict = mnb_count.predict(X_count_test)
    y_predict1 = svm_.predict(X_count_test)
    print(classification_report(y_test, y_predict))
    print(classification_report(y_test, y_predict1))
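# The original code hinted at a TfidfVectorizer alternative; a minimal sketch of that swap
# (assumption: same tokenisation pattern, everything else in NB() unchanged). Returns the
# fitted vectorizer and classifier so the caller can transform and score the test rows.
def demo_nb_tfidf(X_train, y_train):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    vectorizer = TfidfVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    X_tfidf_train = vectorizer.fit_transform(X_train)
    mnb = MultinomialNB()
    mnb.fit(X_tfidf_train, y_train)
    return vectorizer, mnb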
def ft():
    saveDataFile = r'./resources/news_fasttext_all.txt'
    testFile = r'./resources/news_fasttext_环保扶贫1.txt'
    classifier = fasttext.supervised(saveDataFile, output='./mod/xjpnews_classifier_model3',
                                     dim=200, min_count=1, ws=10, epoch=150, neg=5,
                                     word_ngrams=2, bucket=1)
    # classifier = fasttext.load_model("./mod/xjpnews_classifier_model3.bin",
    #                                  encoding='utf-8', label_prefix='__lable__')
    result = classifier.test(testFile)
    print("P@1:", result.precision)                  # precision
    print("R@1:", result.recall)                     # recall
    print("Number of examples:", result.nexamples)   # number of test examples
    texts='不谋全局者,不足谋一域。”2016年7月至今,以绿色税收等措施为发力点,政策力度持续增强。十九大报告亦指出,“必须树立和践行绿水青山就是金山银山的理念”“实行最严格的生态制度”。从经济学视角看环境治理,我们认为,“严监管”不仅有利于生态文明,更在三个层面牵动改革全局,促进中国经济转型升级。'
    texts1='水是指经济环境、制度环境;鱼是企业。他问如果“水”不好、中国的经济很差、中国不适合办企业,那么115家世界500强怎么来的?如果说“水”很好,那么为什么那么多“鱼”非正常死掉?今天很多的企业家在改革开放近40年里在这个国家赚了很多的钱,但他们移民了。2016年,美国的投资移民签了800个人,很多是咱们中国人。他们为什么要移民?这个焦虑是从何而来?这个问题在很多人的心目中仍是一个问号。'
    TextProcess()
    texts = [TextProcess.doAll(texts)]
    texts1 = [TextProcess.doAll(texts1)]
    lables = classifier.predict_proba(texts, k=2)
    print(lables, texts)
    lables1 = classifier.predict_proba(texts1, k=2)
    print(lables1, texts1)
    # Earlier batch-prediction paths, kept commented out:
    # import xml.dom.minidom as xmldom
    # f = glob.glob('./resources/*.xml')
    # fou = open('./resources/xijinping_fasttext_predict.txt', 'w', encoding='UTF-8')
    # for file in f:
    #     print(file)
    #     domboj = xmldom.parse(file)
    #     rows = domboj.getElementsByTagName("row")
    #     for row in rows:
    #         title = row.getElementsByTagName("IR_URLTITLE")[0].firstChild.data
    #         content = row.getElementsByTagName("IR_CONTENT")[0].firstChild.data
    #         # catalog = row.getElementsByTagName("IR_CATALOG")[0].firstChild.data  # title
    #         line = title.strip() + " " + content.strip()
    #         newsline = TextProcess.doAll(line)
    #         lables = classifier.predict([newsline])
    #         outline = "\t" + lables[0][0] + " " + newsline + "\t\n"
    #         fou.write(outline)
    # fou.close()
    #
    # fou = open('./resources/news_fasttext_predict2.txt', 'w', encoding='UTF-8')
    # rows = query_all('%生态%')
    # for row in rows:
    #     title = row[3]
    #     content = row[4]
    #     line = title.strip() + " " + content.strip()
    #     newsline = TextProcess.doAll(line)
    #     lables = classifier.predict_proba([newsline], k=2)
    #     outline = "\t__label__"
    #     fou.write(outline)
    #     if len(lables) == 1:
    #         fou.write(lables[0][0][0] + str(lables[0][0][1]))
    #     if len(lables[0]) == 2:
    #         fou.write(lables[0][1][0] + str(lables[0][1][1]))
    #     fou.write(' ' + line + "\t\n")
    # fou.close()
    return
def gensim():
    data = query_country_name('环保')
    # Word-frequency statistics.
    TextProcess()
    # from collections import defaultdict
    # frequency = defaultdict(int)
    # for r in data:
    #     text = (TextProcess.doAll(r[3] + r[4])).split()
    #     for token in text:
    #         frequency[token] += 1
    # print(frequency)
    corpus = [(TextProcess.doAll(r[3] + r[4])).split() for r in data]
    dictionary = corpora.Dictionary(corpus)
    print(dictionary.dfs)
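# A follow-on sketch, assuming the next step is a bag-of-words corpus for gensim models
# (e.g. TF-IDF or LDA); filter_extremes / doc2bow / TfidfModel are standard gensim calls.
# The corpus and dictionary arguments are the locals built inside gensim() above.
def demo_bow_corpus(corpus, dictionary):
    dictionary.filter_extremes(no_below=5, no_above=0.5)   # drop very rare / very common tokens
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in corpus]
    tfidf_model = models.TfidfModel(bow_corpus)            # gensim TF-IDF over the BoW corpus
    return [tfidf_model[bow] for bow in bow_corpus]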
def prepare(self):
    f = glob.glob('./resources/book/*.txt')
    sents = []
    sents_ = []
    for file in f:
        with open(file, 'r', encoding='utf-8') as fin:
            lines = fin.readlines()
        bookname = lines[0].strip()
        for row in lines:
            row = TextProcess.remove_noisy(row.strip())
            if row is None or row == '' or len(row) < 10 or row[-1] not in zhon.hanzi.punctuation:
                continue
            rowsents = self.splitter.split(row)
            for s in rowsents:
                sents.append([s, bookname])
                sents_.append(s)
        logging.info(bookname + str(len(sents)) + ' sentences')
    fou = open('./resources/book.bin', 'wb')
    pickle.dump(sents, fou)
    fou.close()
    annoyIndex = AnnoyIndex(768)
    encodes = self.bc.encode(sents_)
    for i, sent in enumerate(sents):
        encode = encodes[i]  # self.bc.encode([sent[1]])[0]
        annoyIndex.add_item(i, encode)
    annoyIndex.build(10)
    annoyIndex.save('./mod/book.mod')
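# A loading sketch for the artefacts written by prepare(), assuming it lives on the same
# class and that find_golden_org() below reads them from self.sents / self.annoyIndex
# (dimension 768 matches the BERT sentence encoder used above). The method name is hypothetical.
def load_book_index(self):
    with open('./resources/book.bin', 'rb') as fin:
        self.sents = pickle.load(fin)
    self.annoyIndex = AnnoyIndex(768)
    self.annoyIndex.load('./mod/book.mod')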
def getSim(doc):
    model = models.doc2vec.Doc2Vec.load('./mod/docvec1.model')
    infer = model.infer_vector(TextProcess.doAll(doc).split(' '))
    sims = model.docvecs.most_similar([infer], topn=5)
    for id, sim in sims:
        print(id, sim)
        # Look up which article this id belongs to.
        row = query_(id)
        print(row)
def find_golden_org(self, content, threadhold=0.94):
    sents = list(self.splitter.split(content))
    sents = [
        TextProcess.delNum(TextProcess.remove_noisy(sent)).strip()
        for sent in sents
    ]
    sents_encode = self.bc.encode(sents)
    result = []
    for i, sencode in enumerate(sents_encode):
        sentindex, dis = self.annoyIndex.get_nns_by_vector(
            sencode, 1, include_distances=True)
        print(sents[i] + str(np.cos(dis[0])))
        if np.cos(dis[0]) > threadhold:
            result.append({
                'org': self.sents[sentindex[0]][1].strip(),
                'subcontent': sents[i],
                'score': np.cos(dis[0]),
                'video': self.sents[sentindex[0]] if self.sents[sentindex[0]] else {}
            })
    return result
def tfidf():
    data = query_country_name('环保')
    TextProcess()
    # step 1: bag-of-words counts, then TF-IDF weights
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    corpus = [TextProcess.doAll(r[3] + r[4]) for r in data]
    transformer = TfidfTransformer()
    corpus_train = vectorizer.fit_transform(corpus)
    tfidf = transformer.fit_transform(corpus_train)
    words = vectorizer.get_feature_names()
    words = np.array(words)
    weight = tfidf.toarray()
    word_index = np.argsort(-weight)   # per-document term indices, highest weight first
    words_ = words[word_index]
    print(words_[:3, :3])              # top-3 terms of the first 3 documents
    for word in words:
        print(word)
def similarity(text1, text2):
    List_Text = TextProcess(text1, text2)
    if List_Text == 0:
        print("Error")
    load_mod = Doc2Vec.load(r"/home/prouse/test.model")
    load_mod.random.seed(0)
    a_vec = load_mod.infer_vector(List_Text[0], alpha=0.001, epochs=50)
    load_mod.random.seed(0)
    b_vec = load_mod.infer_vector(List_Text[1], alpha=0.001, epochs=50)
    print(similar(a_vec, b_vec))
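# `similar` is called above but not defined in this snippet; a minimal sketch of what it is
# assumed to compute (cosine similarity between the two inferred vectors). The name
# cosine_similarity_sketch is hypothetical.
def cosine_similarity_sketch(a_vec, b_vec):
    import numpy as np
    a = np.asarray(a_vec, dtype=float)
    b = np.asarray(b_vec, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))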
def genTxt():
    rows = query('%习近平%')
    textProcess = TextProcess()
    allcent = list()
    for row in rows:
        title = row[3]
        content = row[4]
        # Split into sentences on punctuation here, dropping the punctuation itself.
        temp = textProcess.cut_sent(title, ',') + textProcess.cut_sent(content, ',')
        print(temp)
        allcent.extend([item for item in temp if len(item) > 9])
        allcent.extend(['\n'])
    fou = open('./resources/bert_pretrain.txt', 'w', encoding='UTF-8')
    for row in allcent:
        words = textProcess.cut(row)
        if row != '\n':
            fou.write(' '.join(list(words)) + '\n')
        else:
            fou.write('\n')
    fou.close()
    return
def train(ids, docs):
    x_train = []
    for i, doc in enumerate(docs):
        x_train.append(
            models.doc2vec.TaggedDocument(TextProcess.doAll(doc).split(' '), tags=[ids[i]]))
    model = models.doc2vec.Doc2Vec(vector_size=200, window=10, min_count=5, workers=4, negative=4)
    model.build_vocab(x_train)
    # Empirically this is a matter of volume: epochs needs to be roughly on the order of
    # vector_size before the results become visible; the instability reported online
    # appears to come from training with too few epochs.
    model.train(x_train, total_examples=model.corpus_count, epochs=200)
    model.save('./mod/docvec1.model')
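# A usage sketch tying train() and getSim() together, assuming the news_ column layout used
# elsewhere in this module (3=title, 4=content), that column 0 is the primary key, and that
# query() is the same helper genTxt() uses. Illustrative only.
def demo_doc2vec():
    rows = query('%习近平%')
    ids = [r[0] for r in rows]
    docs = [r[3] + r[4] for r in rows]
    train(ids, docs)
    getSim(docs[0])   # print the five stored documents most similar to the first one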
def create_temp(cat_keyword):
    fou = open('./resources/news_fasttext_all.txt', 'w', encoding='UTF-8')
    con = connect_wxremit_db()
    # cur = con.cursor()
    # cur.execute('TRUNCATE table news_temp')
    sql_str = "SELECT * FROM news_ WHERE detail like '%s' " % '%习近平%'
    cur = con.cursor()
    cur.execute(sql_str)
    rows = cur.fetchall()
    for row in rows:
        flag = False
        for topic in cat_keyword:
            catalog = topic
            title = row[3]
            content = row[4]
            line = title.strip() + " " + content.strip()
            outline = TextProcess.doAll(line)
            for keyword in cat_keyword[topic]:
                if outline.find(keyword) > -1:
                    flag = True
                    break
            if flag:
                outline = "\t__label__" + catalog + " " + outline + "\t\n"
                fou.write(outline)
                break
        if not flag:
            catalog = '其它'
            outline = "\t__label__" + catalog + " " + outline + "\t\n"
            fou.write(outline)
    cur.close()
    fou.close()
    con.close()
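# A usage sketch, assuming cat_keyword maps fastText label names to keyword lists; the
# labels and keywords below are illustrative placeholders only (they mirror the 环保/扶贫
# categories used elsewhere in this module). create_temp() writes the training file that
# ft() then reads.
def demo_create_and_train():
    cat_keyword = {'环保': ['环保', '生态', '污染'], '扶贫': ['扶贫', '脱贫']}
    create_temp(cat_keyword)
    ft()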
class PersonDisambiguation():
    def __init__(self):
        self.tp = TextProcess()
        # Data manager: loads the political-figure data from the database.
        self.dataManager = DataManager()
        # self.political_person_dict = list()
        # Stored as an Aho-Corasick automaton instead, which makes multi-pattern matching easy.
        self.aho_policical_person = ahocorasick.Automaton()
        try:
            # load_file = open('./mod/political_person_dict.bin', 'rb')
            # self.political_person_dict = pickle.load(load_file)
            # logging.info('political_person_dict count %d' % (len(self.political_person_dict)))
            file = open('./mod/aho_policical_person.aho', 'rb')
            self.aho_policical_person = pickle.load(file)
            logging.info('aho_policical_person count %d' % (len(self.aho_policical_person)))
        except:
            pass
        self.detector = MultiSenDetect()
        # Load the place-name index, used to decide whether a token tagged 'ns' is really a place.
        load_file = open('./mod/place_dict.bin', 'rb')
        self.place_dict = pickle.load(load_file)
        logging.info('place_dict count %d' % (len(self.place_dict)))
        return

    '''
    Identify political figures, save their basic data, and build the dictionary mapping each
    political figure to the corresponding Baidu Baike entry.
    '''
    def checkPersonBaike(self):
        rows = self.dataManager.query_sql("select * from psm_cityfather")
        persons = []
        for row in rows:
            person = dict()
            person['id'] = row[0]
            person['nationlity'] = row[1]
            person['region'] = row[2]
            person['cname'] = row[3]
            person['duty'] = row[7]
            persons.append(person)
        logging.info('persons count: %d' % len(persons))
        # Use the word-sense disambiguation tool.
        detector = MultiSenDetect()
        count = 0
        persons_temp = self.political_person_dict
        bar = tqdm(persons)
        for person in bar:
            bar.set_description_str(person['cname'])
            # self.political_person_dict = list()
            for p in self.political_person_dict:
                if p['cname'] == person['cname'] and p['duty'] == person['duty']:
                    person['baikename'] = p['baikename']
                    person['baikeurl'] = p['baikeurl']
                    person['baikeconcept'] = p['baikeconcept']
                    person.update()
                    break
            if person.get('baikeconcept'):
                count = count + 1
                persons_temp.append(person)
                continue
            else:
                sent_embedding_res, wds_embedding_res = detector.detect_main(person['duty'], person['cname'])
                # print(sent_embedding_res)
                # print(wds_embedding_res)
                person['baikename'] = wds_embedding_res[0][0]
                person['baikeurl'] = detector.getConcept(person['baikename'])['link']
                person['baikeconcept'] = detector.getConcept(person['baikename'])
                person.update()
                # pprint.pprint(person)
                count = count + 1
                persons_temp.append(person)
                if count % 5 == 0:
                    fou = open('./mod/political_person_dict.bin', 'wb')
                    pickle.dump(persons_temp, fou)
                    fou.close()
                    detector.save_cache()
        detector.save_cache()
        fou = open('./mod/political_person_dict.bin', 'wb')
        pickle.dump(persons, fou)
        fou.close()

    # Server version: completes the object of a VOB relation; a postposed object in the
    # following clause still cannot be recognised.
    def complete_VOB_server(self, arcs, word_index):
        word = arcs[word_index][1]
        prefix = ''
        postfix = ''
        for arc in arcs:
            if arc[5] == word_index and arc[2] < word_index:
                prefix += self.complete_VOB_server(arcs, arc[2])
            if arc[5] == word_index and arc[2] > word_index:
                postfix += self.complete_VOB_server(arcs, arc[2])
        return prefix + word + postfix

    def findPerson(self, content):
        # 1. Split into sentences first.
        sents = self.tp.cut_sentences(content)
        nrs = dict()
        geos = set()
        for sent in sents:
            # nr = set(self.tp.posseg(sent, POS=['nr']))
            # nrs = nrs.union(nr)
            # return nrs
            arcs = self.parseContent(sent)
            for arc in arcs:
                # Probably a person name.
                if arc[3] == 'nh':
                    # Collect the attributive (ATT) keywords that modify the name.
                    # nrs.add(arc[1])
                    prefix = ''
                    for arc_ in arcs:
                        if arc_[5] == arc[2] and arc_[2] < arc[2]:
                            prefix += self.complete_VOB_server(arcs, arc_[2])
                    # if prefix == '':
                    #     nrs[arc[1]] = [prefix]
                    #     continue
                    pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'
                    prefix_list = re.split(pattern, prefix)
                    for prefix_ in prefix_list:
                        if nrs.get(arc[1]):
                            if prefix_ not in nrs.get(arc[1]) and prefix_ != '':
                                nrs[arc[1]].append(prefix_)
                        else:
                            nrs[arc[1]] = [prefix_]
                if arc[3] == 'ns':
                    if (self.place_dict.get(arc[1])):
                        geos.add(arc[1])
        return nrs, geos

    '''Build arcs and child_dict_list via the LTP server.'''
    '''This part can be replaced by other LTP tooling.'''
    def parser_main_ltpserver(self, sentence):
        url = 'http://192.168.1.101:8020/ltp'
        wb_data = requests.post(url, data={'s': sentence, 't': 'dp'}, json=True, allow_redirects=True)
        wb_data.encoding = 'utf-8'
        arcs_list = []
        child_dict_list = []
        try:
            content = wb_data.json()
            for c in content[0][0]:
                p = c.get('parent')
                pc = content[0][0][p]
                pname = pc.get('cont')
                ppos = pc.get('pos')
                arcs_list.append(
                    [c.get('relate'), c.get('cont'), c.get('id'), c.get('pos'), pname, c.get('parent'), ppos])
            for index in range(len(content[0][0])):
                child_dict = dict()
                for arc_index in range(len(arcs_list)):
                    # if arcs[arc_index].relation == 'HED':
                    #     print('hed')
                    if arcs_list[arc_index][5] == index:
                        # arc indices start at 1 ----> HED has been dropped
                        if arcs_list[arc_index][0] in child_dict:
                            child_dict[arcs_list[arc_index][0]].append(arc_index)
                        else:
                            child_dict[arcs_list[arc_index][0]] = []
                            child_dict[arcs_list[arc_index][0]].append(arc_index)
                child_dict_list.append(child_dict)
        except:
            pass
        return arcs_list, child_dict_list

    def parseContent(self, sent):
        arcs, child_dict_list = self.parser_main_ltpserver(sent)
        return arcs

    def test1(self):
        load_file = open('./mod/political_person_dict.bin', 'rb')
        political_person_dict = pickle.load(load_file)
        # pprint.pprint(political_person_dict)
        for i, person in enumerate(political_person_dict):
            if person['cname'] == '哈勒特马·巴特图勒嘎':
                pprint.pprint(person)
                pprint.pprint(i)
                break

    '''
    Update individual entries in political_person_dict instead of regenerating everything.
    '''
    def update_political_person_dict(self, cname, duty):
        load_file = open('./mod/political_person_dict.bin', 'rb')
        political_person_dict = pickle.load(load_file)
        for i, person in enumerate(political_person_dict):
            if person['cname'] == cname and person['duty'] == duty:
                sent_embedding_res, wds_embedding_res = self.detector.detect_main(
                    person['duty'], person['cname'], [person['duty']])
                # print(sent_embedding_res)
                # print(wds_embedding_res)
                person['baikename'] = wds_embedding_res[0][0]
                person['baikeurl'] = self.detector.getConcept(person['baikename'])['link']
                person['baikeconcept'] = self.detector.getConcept(person['baikename'])
                person.update()
                pprint.pprint(person)
        fou = open('./mod/political_person_dict.bin', 'wb')
        pickle.dump(political_person_dict, fou)
        fou.close()

    '''
    Query the 百分点 service for synonyms, used for name alignment.
    '''
    def get_sim(self, something):
        url = 'http://10.122.141.12:9006/similar'
        r = requests.post(url, json={"ck": "synonym", "synonym_word": something,
                                     "synonym_selectedMode": "auto",
                                     "homoionym_word": "", "homoionym_selectedMode": "auto",
                                     "homoionym_num": ""})
        json = r.json()
        result = json['detail']['res']['synonym']
        return result

    '''
    Build the pattern-matching index (a plain dict would also work).
    Person names are not actually extracted by pattern matching here; LTP POS tagging is
    used instead, which gives better accuracy.
    '''
    def genAhocorasick(self):
        load_file = open('./mod/political_person_dict.bin', 'rb')
        self.political_person_dict = pickle.load(load_file)
        self.aho_policical_person = ahocorasick.Automaton()
        for i, person in enumerate(self.political_person_dict):
            word = person.get('cname')
            # Foreign names need alias alignment: 唐纳德·特朗普 ===> 特朗普 / 川普, 习近平 ---> 习主席;
            # most Chinese names do not need alignment.
            aliasPerson = self.get_sim(word)
            baidualias = person.get('baikeconcept').get('别名')
            if word.find('·') > -1:
                aliasPerson.append(word[word.index('·') + 1:])
                aliasPerson.append(word[word.rindex('·') + 1:])
                # Drop the middle name.
                aliasPerson.append(word[word.index('·') + 1:] + word[word.rindex('·'):])
            baidualias_list = []
            if baidualias:
                pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'
                baidualias_list = re.split(pattern, baidualias)
            person_all = set([word]).union(set(aliasPerson)).union(set(baidualias_list))
            for word_ in person_all:
                persons = []
                if self.aho_policical_person.exists(word_):
                    persons = self.aho_policical_person.get(word_)
                persons.append(person)
                self.aho_policical_person.add_word(word_, persons)
        self.aho_policical_person.make_automaton()
        # s = self.aho_policical_person.get('习近平')
        # pprint.pprint(s)
        out = open('./mod/aho_policical_person.aho', 'wb')
        out.write(pickle.dumps(self.aho_policical_person))
        out.close()

    def testAho(self):
        sent = '本院受理的原告易纲诉被告吴勇、王国珍机动车交通事故责任纠纷一案,现已审理终结。判决如下:一、自本判决生效之日起三日内,王国珍赔偿杨旭维修费11703元;二、驳回杨旭的其他诉讼请求。因你下落不明,现依法向你公告送达本案的民事判决书。自本公告发出之日起,经过60日即视为送达。如不服本判决,可在判决书送达之日起十五日内,向本院递交上诉状,并按对方当事人的人数提出副本,上诉于广州市中级人民法院。特此公告。'
        file = open('./mod/aho_policical_person.aho', 'rb')
        aho_policical_person = pickle.load(file)
        for word in aho_policical_person.iter('刘惠'):
            pprint.pprint(word)

    '''
    Recognise political figures in a text.
    repeat:     whether every occurrence of a name is resolved, even if it appears more than once
    att_weight: whether to weight the disambiguation with the person's title/attribute keywords
    geo_weight: whether to weight the disambiguation with geographic locations
    '''
    def recongnizePoliticalPerson(self, sent, repeat=False, att_weight=True, geo_weight=True):
        pperon_candidates = []
        pperson_sure = []
        npperson_sure = []
        # One sentence may contain several political names, possibly namesakes. In a pure
        # pattern-matching mode, names sharing characters would also be extracted: a
        # two-character politician name contained in a three-character ordinary name would be
        # misjudged, so the most reliable approach is still segmentation plus dependency
        # parsing (ATT analysis).
        # Lexical analysis must run first. The LTP server is used here because jieba is not
        # accurate enough, so ltp_server has to be running.
        nrs, geos = self.findPerson(sent)
        # for word in self.aho_policical_person.iter(sent):
        for nr in nrs:
            if not self.aho_policical_person.exists(nr):
                # Only handle political names (and ordinary names identical to them); skip other names.
                continue
            ppersons = self.aho_policical_person.get(nr)  # includes namesake politicians, but not non-politicians
            # If the same name appears several times in one sentence, resolve it only once for efficiency.
            flag = True
            if not repeat:
                for pperon_candidate in pperon_candidates:
                    if pperon_candidate.get('cname') == ppersons[0].get('cname'):
                        flag = False
            if not flag:
                continue
            pperon_candidates = pperon_candidates + ppersons
            # Feed the attributive (ATT) keywords into the disambiguation as extra weight.
            att = []
            if att_weight:
                att = nrs.get(nr)
            # Geographic-location weighting.
            geo = []
            if geo_weight:
                # geo = self.geoKG.parseDoc_global(sent)
                geo = geos
            # sent_embedding_res is unused for now and only kept for the original interface.
            # A category check could be added here: judge from the title whether the person is
            # an official and filter on that.
            # Compared with the whole sentence, the ATT keywords are closer to the person
            # itself; the other keywords are background.
            # Ideally a knowledge-graph-level check would run before this combined disambiguation.
            sent_embedding_res, wds_embedding_res = self.detector.detect_main(
                sent, ppersons[0].get('cname'), att, geo)
            concept = self.detector.getConcept(wds_embedding_res[0][0])  # fetch the metadata back
            for pperson in ppersons:
                # '政治人物' is the tag Baidu assigns to such people; for extra accuracy,
                # check whether the record matches (here: by birth date).
                if concept.get('出生日期') == pperson.get('baikeconcept').get('出生日期'):
                    # and '政治人物' in concept.get('tags'):
                    # logging.info(pperson)
                    # pprint.pprint(pperson)
                    pperson_sure.append(pperson)
                    break
            if pperson not in pperson_sure:
                concept['是否政要'] = '否'
                # pprint.pprint(concept)
                npperson_sure.append(concept)
        # Persist the Baidu lookup cache.
        self.detector.save_cache()
        pprint.pprint(pperson_sure)
        pprint.pprint(npperson_sure)
        return pperson_sure, npperson_sure

    def recongnizePerson(self, sent, repeat=False, att_weight=True, geo_weight=True):
        pperon_candidates = []
        # Same caveats as in recongnizePoliticalPerson: one sentence may contain several
        # (possibly namesake) person names, so segmentation plus dependency parsing is used.
        # Lexical analysis must run first; the LTP server is used because jieba is not
        # accurate enough, so ltp_server has to be running.
        nrs, geos = self.findPerson(sent)
        # for word in self.aho_policical_person.iter(sent):
        for nr in set(nrs):
            att = []
            if att_weight:
                att = nrs.get(nr)
            # Geographic-location weighting.
            geo = []
            if geo_weight:
                # geo = self.geoKG.parseDoc_global(sent)
                geo = geos
            # sent_embedding_res is unused for now and only kept for the original interface.
            sent_embedding_res, wds_embedding_res = self.detector.detect_main(sent, nr, att, geo)
            concept = self.detector.getConcept(wds_embedding_res[0][0])  # fetch the metadata back
            pperon_candidates.append(concept)
        pprint.pprint(pperon_candidates)
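# A minimal usage sketch for PersonDisambiguation, assuming the LTP server and the
# ./mod/*.bin and ./mod/*.aho artefacts referenced above are already in place.
def demo_person_disambiguation():
    pd = PersonDisambiguation()
    sent = '本院受理的原告易纲诉被告吴勇、王国珍机动车交通事故责任纠纷一案,现已审理终结。'
    pperson_sure, npperson_sure = pd.recongnizePoliticalPerson(sent)
    pprint.pprint(pperson_sure)
    pprint.pprint(npperson_sure)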
class Chatbot:
    def __init__(self):
        self.parameters = None
        self.textProcess = None
        self.model = None

    def main(self):
        print('Welcome to Chatbot')
        stream = open("parameters.txt", 'r')
        self.parameters = yaml.safe_load(stream)
        self.textProcess = TextProcess(self.parameters)
        if not os.path.isfile('chatbotModel.pkl'):
            input_context = Input(shape=(self.parameters['maxLength'], ),
                                  dtype="int32", name="input_context")
            embeddingLayer = Embedding(
                self.parameters['vocabularySize'],
                output_dim=self.parameters['embeddingOutputDim'],
                weights=[self.textProcess.embedding_matrix],
                input_length=self.parameters['maxLength'],
                trainable=True)
            embedding_context = embeddingLayer(input_context)
            layer = GlobalMaxPool1D()(embedding_context)
            layer = Dense(int(self.parameters['trainData'] / 2), activation='relu')(layer)
            outputs = Dense(self.parameters['trainData'], activation='softmax')(layer)
            self.model = Model(input=[input_context], output=[outputs])
            adam = Adam(lr=self.parameters['learningRate'])
            self.model.compile(loss="categorical_crossentropy", optimizer=adam)
            self.model.summary()
            self.model.fit(
                self.textProcess.inputSequences[:self.parameters['trainData']],
                self.textProcess.getDecoderOutputData(
                    self.textProcess.targetSequences[:self.parameters['trainData']]),
                batch_size=self.parameters['batchSize'],
                epochs=self.parameters['numEpochs'])
            self.saveDataset()
        else:
            self.loadDataset()
        self.start_chatbot()

    def start_chatbot(self):
        while True:
            question = input('You: ')
            if question == '' or question == 'exit':
                break
            answer = self.decode_sequence(question)
            print('Bot: ' + format(answer))
            print()

    def decode_sequence(self, input_seq):
        target_seq = self.textProcess.getSentenceTokens(input_seq)
        states_value = self.model.predict(target_seq)
        sampled_token_index = np.argmax(states_value[0, :])
        decoded_sentence = self.textProcess.targetData[sampled_token_index]
        return decoded_sentence

    def saveDataset(self):
        with open('chatbotModel.pkl', 'wb') as handle:
            data = {'model': self.model}
            pickle.dump(data, handle, -1)

    def loadDataset(self):
        with open('chatbotModel.pkl', 'rb') as handle:
            data = pickle.load(handle)
            self.model = data['model']
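# A minimal entry-point sketch, assuming this file is run directly to train (or reload)
# the model and start the chat loop.
if __name__ == '__main__':
    bot = Chatbot()
    bot.main()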