Example #1
def data_pre_train_mongo_text(train_path='data/train/'):
    """
    Build the text dataset: one article per txt file.
    """
    ttf = tkitFile.File()
    i = 0
    time_path = '0'
    ttf.mkdir(train_path + time_path)  # directory for the first batch
    for item in tqdm(DB.content_pet.find({})):
        i = i + 1
        if i % 10000 == 0:
            # start a new timestamped batch directory every 10000 articles
            time_path = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            ttf.mkdir(train_path + time_path)
        name = str(int(time.time())) + item['title'][:10] + ".txt"
        file_path = train_path + time_path + "/" + name
        try:
            with open(file_path, 'w', encoding='utf-8') as f1:
                f1.write(item['title'] + "\n")
                f1.write(item['content'] + "\n")
        except Exception:
            # skip articles whose title yields an invalid filename
            pass
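The function relies on module-level globals: DB (a pymongo database handle) plus tqdm, time, and tkitFile. A minimal setup sketch, assuming a local MongoDB with a content_pet collection (host, port, and database name are assumptions for illustration):

import time

import pymongo
import tkitFile
from tqdm import tqdm

# Hypothetical setup for the globals the function expects.
client = pymongo.MongoClient("localhost", 27017)
DB = client.kg_scrapy  # assumed database name

data_pre_train_mongo_text(train_path='data/train/')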
Example #2
    def download_model(self):
        """Download the model files automatically."""
        tfile = tkitFile.File()
        tfile.mkdir('tkitfiles/')
        tfile.mkdir('tkitfiles/' + self.model_version)
        th = tkitWeb.Http()
        # fetch every file the model needs from the CDN
        base_url = "http://cdn.terrychan.org/model/tkit/tkitMarker/" + self.model_version + "/"
        for name in ("pytorch_model.bin", "config.json", "vocab.txt", "tag.txt"):
            data = th.download(base_url + name,
                               name,
                               dirname='tkitfiles/' + self.model_version)
            print(data)
Example #3
def save_data(data, path='data/', name="train.json"):
    """
    保存数据
    """
    tkitFile.File().mkdir(path)
    data_path = path + name
    tjson = tkitFile.Json(file_path=data_path)
    tjson.save(data)
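A quick usage sketch; the record shape is illustrative, since save_data only needs something tkitFile.Json can serialize:

# Hypothetical records.
samples = [{"text": "example sentence", "label": "demo"}]
save_data(samples, path='data/', name="train.json")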
Example #4
def csv_list(path="data/csv/"):
    f = tkitFile.File()
    files = f.file_List(path, type='csv')
    for line in files:
        print('add:', line)
        try:
            data = csv_data(file_path=line)
            add_data(data=data)
        except Exception:
            print('malformed csv file, skipping:', line)
Example #5
def add_data(data, path='data/', name="data.json"):
    """
    Add data samples, e.g.:
    data=[{"keywords": "哈士奇,主人,嚎叫,便是,说明,思考,没有,犬种,原因,新手,", "content": "新手养狗,哈是无忧无的经验和耐心。"}]
    """
    tkitFile.File().mkdir(path)
    data_path = path + name
    tjson = tkitFile.Json(file_path=data_path)
    tjson.save(data)
    return tjson.auto_load()
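Usage sketch, assuming tkitFile.Json.save appends to the file and auto_load returns everything stored so far (which is what returning auto_load() after save suggests):

new_samples = [{"keywords": "dog,owner", "content": "Sample article text."}]
all_samples = add_data(new_samples)
print(len(all_samples))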
Example #6
    def save_to_json_kg(self):
        """
        保存知识提取训练集
        https://www.kaggle.com/terrychanorg/albert-bilstm-crf-pytorch/data
        """
        tkitFile.File().mkdir("../tdata/kg")
        kgjson_t = tkitFile.Json("../tdata/kg/train.json")
        kgjson_d = tkitFile.Json("../tdata/kg/dev.json")

        self.tdb.load("kg_mark_unique_data")
        data = []
        for k, v in self.tdb.get_all():
            try:
                it = self.tdb.str_dict(v)
                label = ['O'] * len(it['sentence'])
                s = 0
                for one in it['kgs']:
                    # mark each description span (one[2]) inside the sentence
                    label, s1 = self.mark_word_label(it['sentence'], label,
                                                     one[2], "描述")
                    if s1 >= 0:
                        s = s + 1
                # build the sample from the last (entity, predicate) pair in it['kgs']
                d = {
                    'text':
                    list(one[0] + '#' + one[1] + '#' + it['sentence']),
                    'label': ['K'] * len(one[0]) + ['X'] +
                    ['P'] * len(one[1]) + ['X'] + label
                }
                if s > 0:
                    data.append(d)
            except Exception:
                continue
        c = int(len(data) * 0.85)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        print("Total samples generated:", len(data))
        # automatically deduplicate the exported files
        self.json_remove_duplicates("../tdata/kg/train.json")
        self.json_remove_duplicates("../tdata/kg/dev.json")
        print("Data exported to ../tdata/kg")
Example #7
def save_collection(collection_name, DB):
    """
    Save a single collection.
    """
    path = os.path.join("data", DB.name)
    tkitFile.File().mkdir(path)
    json_save = os.path.join("data", DB.name, collection_name + ".json")
    json_backup = tkitFile.Json(json_save)
    for it in DB[collection_name].find():
        try:
            json_backup.save([it])
        except Exception:
            print("error saving record:")
            print(it)
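The bare try/except here most likely masks JSON-serialization failures: pymongo returns _id as a bson.ObjectId, which a plain JSON encoder rejects. A sketch of the loop that converts the id instead of dropping the record (assuming that is indeed the failure mode):

    for it in DB[collection_name].find():
        it['_id'] = str(it['_id'])  # ObjectId is not JSON-serializable as-is
        json_backup.save([it])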
Example #8
def data_pre_train_file(path='./data/'):
    """
    Generate training samples.
    """
    tkitFile.File().mkdir(path)
    train_path = path + 'train.txt'
    data_path = path + 'data.json'
    data_pre_train(data_path=data_path, train_path=train_path)
Example #9
def data_pre_train_mongo_text(keyword, train_path='../data/'):
    """
    构建文本数据将单篇文章一个txt文件
    """

    # tt=tkitText.Text()
    #这里定义mongo数据
    # client = pymongo.MongoClient("localhost", 27017)
    # DB_kg_scrapy = client.kg_scrapy

    # q={}
    i = 0
    # content_pet
    # for item in DB_kg_scrapy.kg_content.find(q):
    # time_path='0'
    ttf = tkitFile.File()
    ttf.mkdir(train_path + keyword)
    for item in tqdm(search_content(keyword)):
        i = i + 1
        # if i%10000==0:

        # time_path =str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        #     break

        # #跳过大于1000的文章
        # if len(item.content)>1000:
        #     continue
        if len(item.title + item.content) < 50:
            continue
        p = tclass.pre(item.content)
        # print(p)
        if p == 0:
            continue
        name = str(int(time.time())) + item.title[:10] + ".txt"
        # file_path=os.path.join(train_path,name)
        file_path = train_path + keyword + "/" + name
        # print(file_path)
        try:
            with open(file_path, 'w', encoding='utf-8') as f1:

                f1.write(item.title + "\n")
                f1.write(item.content[:500] + "\n")
        except:
            pass
Example #10
    def save_to_json_SQuAD(self):
        """Export the marked data in SQuAD v1.x format."""
        tkitFile.File().mkdir("../tdata/SQuAD")
        kgjson_t = tkitFile.Json("../tdata/SQuAD/train.json")
        kgjson_d = tkitFile.Json("../tdata/SQuAD/dev.json")
        data = []
        for it in DB.kg_mark_unique_data.find():
            k = it['_id']
            one_q = {
                "id": k + "_s",
                "context": it['sentence'],
                "qas": []
            }
            # group relation words by entity
            ner = {}
            for one in it['kgs']:
                try:
                    if one[1] not in ner[one[0]]:
                        ner[one[0]].append(one[1])
                except KeyError:
                    ner[one[0]] = [one[1]]
            for nr in ner:
                answers = []  # answers for this entity
                label = ['O'] * len(it['sentence'])
                for n in ner[nr]:
                    try:
                        label, s1 = self.mark_word_label(it['sentence'], label, n, "关系")
                        if s1 >= 0:
                            answers.append({
                                "answer_start": s1,
                                "text": n
                            })
                    except Exception:
                        pass
                if len(answers) > 0:
                    # one QA item per entity
                    one_q['qas'].append({
                        "question": nr,
                        'id': k + "_ner_rel_" + nr,
                        'answers': answers
                    })
            if len(one_q['qas']) > 0:
                one_kg = {
                    'paragraphs': [one_q],
                    'id': k + "_kg",
                    'title': it['sentence'][:10]
                }
                data.append(one_kg)

        c = int(len(data) * 0.85)
        t_data = {
            "version": "v1.0",
            "data": data[:c]
        }
        d_data = {
            "version": "v1.0",
            "data": data[c:]
        }
        kgjson_t.save([t_data])
        kgjson_d.save([d_data])
        print("Total samples generated:", len(data))
        print("Data exported to ../tdata/SQuAD")
Example #11
                for w, m in it:
                    labels[m] = 1
            keys = []
            for key in labels.keys():
                keys.append(key)
            f1.write("\n".join(keys))


# data_path = '../data'
data_path = input("Data Path:")
if not data_path:
    data_path = "/home/t/dev/auto-translation-plan/clear-content-marker/data/"
ttf = tkitFile.File()
tt = tkitText.Text()
data = []
anns = []
bad = 0
good = 0
bad_files = []
for f_path in ttf.all_path(data_path):
    if f_path.endswith(".anns"):
        anns.append(f_path)
        one_data = _read_data(f_path)
        if len(one_data) == 0:
Example #12
"""
Split the marked data and save it as training datasets.
"""
import tkitFile

tkitFile.File().mkdir("data/train")
mjson = tkitFile.Json("data/marked.json")
tjson = tkitFile.Json("data/train/train.json")
djson = tkitFile.Json("data/train/dev.json")
data = []
for item in mjson.auto_load():
    print(item)
    data.append(item)
c = len(data) * 0.8
tjson.save(data[:int(c)])
djson.save(data[int(c):])
Example #13
    def save_to_json_kg_tmark(self):
        """
        Save the knowledge-extraction training set.
        https://github.com/napoler/tmark_Description
        https://www.kaggle.com/terrychanorg/tmark-description
        """
        tkitFile.File().mkdir("../tdata/kg_tmark")
        data = []
        for it in DB.kg_mark_unique_data.find():
            try:
                label = ['O'] * len(it['sentence'])
                s = 0
                for one in it['kgs']:
                    # mark each predicate+description span (one[1] + one[2])
                    label, s1 = self.mark_word_label(it['sentence'], label,
                                                     one[1] + one[2], "描述")
                    if s1 >= 0:
                        s = s + 1
                # build the sample from the last entity (one[0]) in it['kgs']
                d = {'text': list(one[0]) + ['[SEP]'] + list(it['sentence']),
                     'label': ['实体'] * len(one[0]) + ['X'] + label}
                if len(d['text']) == len(d['label']):
                    one_kg_tmk = []
                    for t, tmk_l in zip(d['text'], d['label']):
                        one_kg_tmk.append((t, tmk_l))
                    if s > 0:
                        data.append(one_kg_tmk)
            except Exception:
                continue
        print("Total samples generated:", len(data))
        print(data[:10])
        print(len(data))
        # split 70/15/15 into train/dev/test
        c = int(len(data) * 0.7)
        b = int(len(data) * 0.85)
        train_data = data[:c]
        dev_data = data[c:b]
        test_data = data[b:]
        self.save_data(train_data, file="../tdata/kg_tmark/train.txt")
        self.save_data(dev_data, file="../tdata/kg_tmark/dev.txt")
        self.save_data(test_data, file="../tdata/kg_tmark/test.txt")
        self.save_labels(data, "../tdata/kg_tmark/labels.txt")
        print("Data exported to ../tdata/kg_tmark")
Example #14
    def save_to_json_ner_bio(self):
        """Export entity-only NER data as (token, tag) pairs."""
        tkitFile.File().mkdir("../tdata/onlyner_bio")
        data = []
        nlp_plus = tkitNlp.Plus()
        nlp_plus.load_tlp()
        flags = {}
        for it in DB.kg_mark_unique_data.find():
            try:
                text = it['sentence']
                label = ["O"] * len(text)
                # group relation words by entity
                ner = {}
                for one in it['kgs']:
                    try:
                        if one[1] not in ner[one[0]]:
                            ner[one[0]].append(one[1])
                    except KeyError:
                        ner[one[0]] = [one[1]]
                ner_list = [tmp for tmp in ner.keys()]
                # merge in the entities found by the NER tagger
                fner = []
                for word, flag in nlp_plus.ner(text):
                    flags[flag] = 0
                    fner.append(word)
                ner_list = list(set(ner_list + fner))
                # mark shorter entities first
                ner_list = sorted(ner_list, key=lambda i: len(i), reverse=False)
                s = 0
                for nr in ner_list:
                    label, s1 = nlp_plus.mark_word_label(text, label, nr, "实体")
                    if s1 >= 0:
                        s = s + 1
                if s > 0:
                    one = []
                    for it_w, it_l in zip(list(text), label):
                        one.append((it_w, it_l))
                    data.append(one)
            except Exception:
                pass
        nlp_plus.release()
        print(data[:10])
        print(len(data))
        # split 70/15/15 into train/dev/test
        c = int(len(data) * 0.7)
        b = int(len(data) * 0.85)
        train_data = data[:c]
        dev_data = data[c:b]
        test_data = data[b:]
        self.save_data(train_data, file="../tdata/onlyner_bio/train.txt")
        self.save_data(dev_data, file="../tdata/onlyner_bio/dev.txt")
        self.save_data(test_data, file="../tdata/onlyner_bio/test.txt")
        self.save_labels(data, "../tdata/onlyner_bio/labels.txt")
Example #15
    def save_to_json_ner(self):
        """Export entity-only NER data as text/label records."""
        tkitFile.File().mkdir("../tdata/onlyner")
        kgjson_t = tkitFile.Json("../tdata/onlyner/train.json")
        kgjson_d = tkitFile.Json("../tdata/onlyner/dev.json")
        self.tdb.load("kg_mark_unique_data")
        data = []
        nlp_plus = tkitNlp.Plus()
        nlp_plus.load_tlp()
        flags = {}
        for it in DB.kg_mark_unique_data.find():
            try:
                text = it['sentence']
                label = ["O"] * len(text)
                # group relation words by entity
                ner = {}
                for one in it['kgs']:
                    try:
                        if one[1] not in ner[one[0]]:
                            ner[one[0]].append(one[1])
                    except KeyError:
                        ner[one[0]] = [one[1]]
                ner_list = [tmp for tmp in ner.keys()]
                # merge in the entities found by the NER tagger
                fner = []
                for word, flag in nlp_plus.ner(text):
                    flags[flag] = 0
                    fner.append(word)
                ner_list = list(set(ner_list + fner))
                # mark shorter entities first
                ner_list = sorted(ner_list, key=lambda i: len(i), reverse=False)
                s = 0
                for nr in ner_list:
                    label, s1 = nlp_plus.mark_word_label(text, label, nr, "实体")
                    if s1 >= 0:
                        s = s + 1
                if s > 0:
                    one = {'text': list(text), 'label': label}
                    data.append(one)
            except Exception:
                pass
        nlp_plus.release()

        c = int(len(data) * 0.85)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        print("Total samples generated:", len(data))
        # automatically deduplicate the exported files
        self.json_remove_duplicates("../tdata/onlyner/train.json")
        self.json_remove_duplicates("../tdata/onlyner/dev.json")
        print("Data exported to ../tdata/onlyner")
Example #16
    def __init__(self):
        tkitFile.File().mkdir("../tdata")
        self.tdb = tkitDb.LDB(path="../tdata/lv.db")
        self.ss = tkitSearch.Search()
Example #17
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')  # suppress gensim warnings

import logging
import os.path
import sys
import multiprocessing

from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence,PathLineSentences
import gensim
import tkitFile

ff = tkitFile.File()

#program = os.path.basename(sys.argv[0])
#logger = logging.getLogger(program)

#logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
#logger.info("running %s" % ' '.join(sys.argv))

fdir = 'model/'
ddir = 'data/train/'
ff.mkdir("model/")
ff.mkdir("data")
ff.mkdir("data/train")
inp = ddir + 'data_seg.txt'
outp1 = fdir + 'word2vec.model'
outp2 = fdir + 'word2vec.vector'
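The snippet stops before the actual training step. A typical continuation using the imports already in place (parameter values are illustrative; size= is the gensim 3.x name, later versions renamed it to vector_size=):

# Train word2vec on the pre-segmented corpus and save both formats.
model = Word2Vec(LineSentence(inp), size=256, window=5, min_count=5,
                 workers=multiprocessing.cpu_count())
model.save(outp1)
model.wv.save_word2vec_format(outp2, binary=False)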
Example #18
from relation_extraction import getRelation
from utils import readFile
import tkitFile
import gc
# from albert_pytorch import classify
tfile = tkitFile.File()
# Tclass = classify(model_name_or_path='tkitfiles/checkkg')
# def pre(data):
#     """
#     Get prediction results.
#     """
#     data['sentence_b'] = ",".join(data['kg'])
#     p = Tclass.pre(data['sentence'], data['sentence_b'])
#     softmax = Tclass.softmax()
#     Tclass.release
#     print("class", "|", "probability")
#     pre = []
#     for ck, rank in zip([1, 2], softmax):
#         print(ck, "|", rank)
#         pre.append([ck, round(rank, 4)])
#     gc.collect()
#     return p + 1, pre

path = "/mnt/data/dev/tdata/wiki_zh"
relations_all = []
for f in tfile.all_path(path):
    print(f)
    para = readFile(f)
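    # The example is truncated here. A guessed continuation, based only on the
    # getRelation import and the relations_all accumulator defined above:
    relations_all.extend(getRelation(para))  # hypothetical call shape
    gc.collect()  # free memory between large wiki files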
Example #19
    def save_to_json_ner_rel(self):
        """Export entity-relation NER data."""
        tkitFile.File().mkdir("../tdata/ner_rel")
        kgjson_t = tkitFile.Json("../tdata/ner_rel/train.json")
        kgjson_d = tkitFile.Json("../tdata/ner_rel/dev.json")
        data = []
        for it in DB.kg_mark_unique_data.find():
            try:
                # group relation words by entity
                ner = {}
                for one in it['kgs']:
                    try:
                        if one[1] not in ner[one[0]]:
                            ner[one[0]].append(one[1])
                    except KeyError:
                        ner[one[0]] = [one[1]]
                for nr in ner:
                    s = 0
                    label = ['O'] * len(it['sentence'])
                    for n in ner[nr]:
                        label, s1 = self.mark_word_label(it['sentence'], label, n, "关系")
                        if s1 >= 0:
                            s = s + 1
                    if s > 0:
                        one_ner = {'text': list(nr + '#' + it['sentence']),
                                   'label': ['K'] * len(nr) + ['X'] + label}
                        data.append(one_ner)
            except Exception:
                continue
        c = int(len(data) * 0.85)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        print("Total samples generated:", len(data))
        # automatically deduplicate the exported files
        self.json_remove_duplicates("../tdata/ner_rel/train.json")
        self.json_remove_duplicates("../tdata/ner_rel/dev.json")
        print("Data exported to ../tdata/ner_rel")