Example no. 1
def data_pre_train_mongo_check_pet():
    """
    Check whether each article is about pets and cache the result in MongoDB.

    >>> data_pre_train_mongo_check_pet()
    """
    # Pet-content classifier
    petclass = classify(model_name_or_path='tkitfiles/petclass/', num_labels=2, device='cuda')
    i = 0
    n = 0
    for item in get_one():
        # Skip articles that were already classified
        if DB.content_pet.find_one({"_id": item['_id']}):
            continue
        i = i + 1
        txt = item['title'] + item['content']
        p = petclass.pre(txt[:500])  # classify on the first 500 characters
        item['class'] = p
        if p == 1:
            n = n + 1
        try:
            DB.content_pet.insert_one(item)
        except Exception:
            pass  # ignore duplicate-key and transient insert errors
        if i % 100 == 0:
            print(n / i, i)  # running share of pet articles
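
The snippet assumes two names defined elsewhere: get_one() and DB. A minimal sketch of plausible definitions, reusing the local MongoDB setup from Example no. 6 (the source collection name is hypothetical):

import pymongo

client = pymongo.MongoClient("localhost", 27017)
DB = client.hugo  # database name taken from Example no. 6

def get_one():
    """Yield source articles one at a time (collection name is a guess)."""
    for item in DB.content.find({}):
        yield item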
Example no. 2
def data_pre_train_mongo_to_json(data_path='data/data.json', train_path='data/train_db_Summary.txt'):
    """
    Convert articles to JSON format.

    >>> data_pre_train_mongo_to_json()
    """
    # Pet-content classifier
    petclass = classify(model_name_or_path='tkitfiles/petclass/', num_labels=2, device='cuda')
    i = 0
    data = []
    for item in get_one():
        i = i + 1
        txt = item['title'] + item['content']
        p = petclass.pre(txt[:500])  # classify on the first 500 characters
        item['class'] = p
        if p == 1:
            data.append(item)
        if i % 100 == 0:
            print(i)  # progress
        if i % 10000 == 0:
            # Flush the accumulated batch every 10000 articles
            try:
                add_data(data)
                data = []
            except Exception:
                pass  # keep the batch and retry on the next flush

    add_data(data)  # flush the remainder
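
add_data() is not defined in this snippet; a minimal sketch, assuming it appends each batch to the JSON dataset one object per line (the helper body is hypothetical):

import json

def add_data(batch, data_path='data/data.json'):
    """Append a batch of classified articles as JSON lines (hypothetical)."""
    with open(data_path, 'a', encoding='utf-8') as f:
        for item in batch:
            item = dict(item)
            item.pop('_id', None)  # Mongo ObjectId is not JSON-serializable
            f.write(json.dumps(item, ensure_ascii=False) + '\n')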
Example no. 3
def dev_terry():
    file_path = "dataset/terry/dev.json"
    tjson = tkit.Json(file_path=file_path).auto_load()
    n = 0
    all = 1
    xs = []
    ys = []
    # Load the classifier once, outside the loop
    tclass = classify(model_name_or_path='outputs/terry_output')
    # Create the figure
    plt.figure(figsize=(8, 6), dpi=80)
    # Enable interactive mode
    plt.ion()

    for item in tqdm(tjson):
        # text = "我把小狗宠坏了,现在的小狗已经长大,一直追着兔子跑!"
        text = item['sentence']
        p = tclass.pre(text)
        if p < 3:
            print(item)
            print(p)
            all = all + 1
            if p == item["label"]:  # reuse the prediction instead of re-running the model
                n = n + 1
        xs.append(all)
        ys.append(n / all)
        if all % 10 == 0:
            # Clear the previous plot
            plt.cla()
            plt.grid(True)
            plt.plot(xs, ys)
            # Pause so the figure can redraw
            plt.pause(0.1)
            plt.show()

    print("####" * 30)
    print("Total predictions", all)
    print("Correct predictions", n)
    print("Accuracy", n / all)

    # Turn off interactive mode
    plt.ioff()
    # Keep the final figure on screen
    plt.show()
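
The live-plot pattern above stands on its own; a self-contained sketch with a simulated prediction stream, so it runs without the model or dataset:

import random
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6), dpi=80)
plt.ion()  # interactive mode: plt.pause() redraws without blocking

xs, ys, correct = [], [], 0
for step in range(1, 201):
    correct += random.random() < 0.8  # stand-in for "prediction == label"
    xs.append(step)
    ys.append(correct / step)  # running accuracy
    if step % 10 == 0:
        plt.cla()
        plt.grid(True)
        plt.plot(xs, ys)
        plt.pause(0.1)

plt.ioff()
plt.show()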
Example no. 4
def dev():
    file_path = "dataset/terry_r_rank/dev.json"
    tjson = tkit.Json(file_path=file_path).auto_load()
    n = 0
    all = 1
    xs = []
    ys = []
    # Load the regression-style ranker once, outside the loop
    # tclass = classify(model_name_or_path='prev_trained_model/terry_rank_output')
    tclass = classify(model_name_or_path='outputs/terry_r_rank/',
                      num_labels=1,
                      device='cuda')
    # Create the figure
    plt.figure(figsize=(8, 6), dpi=80)
    # Enable interactive mode
    plt.ion()

    for item in tqdm(tjson):
        # text = "我把小狗宠坏了,现在的小狗已经长大,一直追着兔子跑!"
        text = item['sentence']
        p = tclass.pre_r(text)
        print(item["label"], p.tolist()[0][0])
Example no. 5
    Ner.args['albert_path'] = "tkitfiles/ner"
    Ner.args['albert_embedding'] = 312
    Ner.args['rnn_hidden'] = 400
    Ner.model_version = 'ner'
    Ner.args['max_length'] = 200
    Ner.setconfig()
    return Ner


TNer = get_tner()

Ner = get_ner()
P = get_p()

Tclass = classify(model_name_or_path='tkitfiles/checkkg', device='cpu')
# Check whether the content is a knowledge statement
Check_kg = classify(model_name_or_path='../tdata/albert_check_kg',
                    device='cpu')
# check_pet=classify(model_name_or_path='../tdata/albert-chinese-pytorch-pet')

# ie=tkitNlp.TripleExtractor()

# Get relation words from an existing dictionary
Terry_er = TEntityRel()

ttht = tkitText.Text()
ttht.load_ht()
HT = ttht.ht

# Define the MongoDB connection here
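
The code this comment announces was cut off; the same setup appears in full in Examples no. 6 and no. 9:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
DB = client.hugo  # database name as in Example no. 6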
Example no. 6
from elasticsearch import helpers
from tkitTranslator import Translator
# import tkitJson
import pymongo
# Import the installed library
import Bert_clear_title
from albert_pytorch import classify
"""
本脚本用于处理之前的内容


"""

# Load the quality (good/bad) classifier
tclass = classify(
    model_name_or_path="/mnt/data/dev/model/classification-text-good-bad/model",
    num_labels=2,
    device="cpu")

# Model downloaded from https://www.kaggle.com/terrychanorg/bertcleartitlemodel
TClear = Bert_clear_title.Marker(
    model_path="/mnt/data/dev/model/Bert_clear_title/model/")
TClear.load_model()

client = pymongo.MongoClient("localhost", 27017)
DB = client.hugo
OldDB = client.gpt2Write
items = []
for i, resp in enumerate(OldDB.content_pet.find({})):
    # print(resp)
    # continue
    # if i >100:
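    # --- The loop body is truncated in the source. A hypothetical continuation,
    # --- following the first-500-characters classification pattern of Example no. 1:
    txt = resp.get('title', '') + resp.get('content', '')
    p = tclass.pre(txt[:500])
    if p == 1:  # assumption: label 1 means "good quality"
        items.append(resp)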
Example no. 7
from __future__ import unicode_literals
import sys
import os
# sys.path.append("../")
import argparse
from utils import divideSentences, readFile, readDir
from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

from mark import *
from TEntityRel import *
from config import *

from albert_pytorch import classify
# tfile=tkitFile.File()
Tclass = classify(
    model_name_or_path=
    '/mnt/data/dev/github/open-entity-relation-extraction关系提取/open-entity-relation-extraction/关系判断/tkitfiles/checkkg'
)


def pre(data):
    """
    Get the prediction result for a sentence / knowledge pair.
    """
    # tkg = "[kg] "+",".join(data['kg'])+" [/kg] "+data['sentence']
    data['sentence_b'] = ",".join(data['kg'])
    p = Tclass.pre(data['sentence'], data['sentence_b'])
    softmax = Tclass.softmax()
    Tclass.release  # note: a bare attribute access; it frees nothing unless release is a property
    # print("分类", "|", '概率')
    # pre = []
    # for ck, rank in zip([1, 2], softmax):
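
The function is cut off in the middle of its commented-out lines; following their hint, a hypothetical completion that pairs each class id with its softmax probability (pre_with_probs is an illustrative name, not from the source):

def pre_with_probs(data):
    """Return (class_id, probability) pairs for a sentence/knowledge pair."""
    data['sentence_b'] = ",".join(data['kg'])
    Tclass.pre(data['sentence'], data['sentence_b'])
    softmax = Tclass.softmax()
    return list(zip([1, 2], softmax))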
Example no. 8
import tkitFile
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
from config import *
from tqdm import tqdm
from albert_pytorch import classify

tclass = classify(model_name_or_path='../model/goodorbad/',
                  num_labels=2,
                  device='cpu')

import time


def search_content(keyword):
    client = Elasticsearch()
    # Match the keyword against both the title and the body fields
    q = Q("multi_match", query=keyword, fields=['title', 'body'])
    # s = Search(using=client, index="pet-index").query("match", content="金毛")
    s = Search(using=client, index="pet-index").query(q)
    s = s[0:100]  # take the first 100 hits
    response = s.execute()
    return response


    # for hit in response:
    #     print(hit.meta)
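
A usage sketch: query the index and score each hit with the good/bad classifier (that hits expose title and body attributes is an assumption about the index mapping):

if __name__ == "__main__":
    for hit in search_content("金毛"):
        p = tclass.pre((hit.title + hit.body)[:500])
        print("good" if p == 1 else "bad", hit.title)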
Example no. 9
def data_pre_train_mongo(data_path='data/data.json', train_path='data/train_db.txt'):
    """
    Build marked-up pretraining text from the articles.

    [unused5] marks keywords
    [unused6] marks the title
    [unused7] marks the preceding title
    [unused8] marks the body text
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 10
    article_max_len = 500
    ttext = tkitText.Text()

    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # ie = tkitNlp.TripleIE(model_path="/mnt/data/dev/model/ltp/ltp_data_v3.4.0")
    f1 = open(train_path, 'w')
    tt = tkitText.Text()
    # TF-IDF keyword-extraction interface
    tfidf = analyse.extract_tags
    # TextRank keyword-extraction interface
    textrank = analyse.textrank
    # Define the MongoDB connection here
    client = pymongo.MongoClient("localhost", 27017)
    DB_kg_scrapy = client.kg_scrapy
    print(DB_kg_scrapy.name)
    q = {}
    tclass = classify(model_name_or_path='tkitfiles/check_pet', num_labels=10, device='cuda')
    Ner = get_ner()
    i = 0
    # for item in DB_kg_scrapy.kg_content.find(q):
    tjson = tkitFile.Json(file_path=data_path)
    for item in tqdm(tjson.auto_load()):
        i = i + 1
        if i % 10000 == 0:
            print(i)  # progress
        # Shorter articles get shorter summaries
        if len(item['content']) > 500:
            SENTENCES_COUNT = 5
        else:
            SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(item['content'], Tokenizer(LANGUAGE))
        l = []  # summary sentences
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            l.append(str(sentence))
        # keywords = textrank(item['title']+'\n'+item['content'], topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
        keywords = textrank(item['title'] + '\n' + item['content'], topK=10, withWeight=False)
        keyphrases = tt.get_keyphrases(item['title'] + '\n' + item['content'])

        # print("==="*20)
        # print("",item['title'])
        # print(item['content'][:100])
        p=tclass.pre(item['content'])
        # print("预测结果",p)
        # softmax=tclass.softmax()
        # print(softmax)
        # sentences=tt.sentence_segmentation_v1( item['title']+'。'+item['content'])
        # words_list=[]
        # for sentence in sentences:
        #     ner_list=Ner.pre(sentence)
        #     for it in ner_list[0][1]:
        #         words_list.append(it.get("words"))
        # # print(words_list)
        # keywords=keywords+keyphrases+words_list
        keywords=keywords+keyphrases
        keywords=list(set(keywords))
        # print(ner_list)
        content=" [KW] "+",".join(keywords)+" [/KW]  [TT] "+ item['title']+" [/TT] [SM] "+"".join(l)+" [/SM] [CONTNET] "+item['content']+" [/CONTNET] [PT] "+ item['title']+" [/PT] [END]"
        content = content.replace("\n\n\n", "\n\n")
        content = content.replace("\n", " [SEP] ")
        # content_list = cut_text(content, 480)
        # f1.write("\n".join(content_list))
        if p == 1:
            f1.write(content)
            f1.write("\n")
    f1.close()
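
cut_text() survives only in the commented-out lines above; a minimal sketch, assuming it slices text into fixed-length chunks:

def cut_text(text, length=480):
    """Split text into consecutive chunks of at most `length` characters (hypothetical)."""
    return [text[i:i + length] for i in range(0, len(text), length)]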