Code example #1
import os
import subprocess

import tkitFile
import tkitText


def get_predict_title(text, plen, n):
    ttext = tkitText.Text()
    # Cache key: md5 of the prompt text plus the generation parameters.
    tid = ttext.md5(str(text) + str(plen) + str(n))

    data_path = "tmp/run_task" + tid + ".json"
    print('load', data_path)

    def load_cached():
        try:
            tjson = tkitFile.Json(file_path=data_path)
            return tjson.load()[0]['data']
        except Exception:
            print('failed to load file', data_path)
            return {}

    if not os.path.exists(data_path):
        # No cached result yet: run the generator and read its output file.
        # Note: interpolating user text into a shell command is unsafe;
        # prefer subprocess.call with an argument list in real use.
        cmd = "python3 ./generate.py --prefix '''" + text + "''' --length " + str(
            plen) + " --nsamples " + str(n) + " --tid " + str(tid)
        print("processing: " + cmd)
        if subprocess.call(cmd, shell=True) == 0:
            return load_cached()
        return {}
    # Cache hit: reuse the earlier prediction.
    return load_cached()
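
For context, a hypothetical call (the ./generate.py script, the tmp/ directory, and the prompt below are assumptions, not part of the original):

# Sketch: the first call shells out to ./generate.py; a second call with
# identical arguments is served from the md5-keyed JSON cache instead.
result = get_predict_title("宠物狗的训练方法", plen=64, n=3)
print(result)  # {} on failure, otherwise the generator's 'data' payload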
Code example #2
    def json_remove_duplicates(self, json_file):
        """Rewrite json_file in place with exact-duplicate records removed.

        Requires json, shutil and tkitFile at module level.
        """
        print("trying to remove duplicate records")
        origin_json = tkitFile.Json(json_file)
        temp = tkitFile.Json(json_file + ".tmp.json")
        data = []
        for item in origin_json.auto_load():
            # Serialize each record so identical ones can be collapsed by set().
            data.append(json.dumps(item))
        new = list(set(data))
        print("original length", len(data))
        new_json = [json.loads(item) for item in new]
        print("new length", len(new_json))
        temp.save(new_json)
        print("duplicates removed", len(data) - len(new_json))
        # Overwrite the original file with the deduplicated copy.
        shutil.move(json_file + ".tmp.json", json_file)
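
Round-tripping through set() loses record order. A minimal order-preserving sketch using only the standard library (dedupe_records is a hypothetical helper, not part of tkitFile):

import json

def dedupe_records(records):
    # dict.fromkeys keeps first-seen order (guaranteed since Python 3.7),
    # and sort_keys makes the serialized key independent of dict key order.
    seen = dict.fromkeys(json.dumps(r, sort_keys=True) for r in records)
    return [json.loads(s) for s in seen]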
Code example #3
    def save_to_json_kg(self):
        """
        Save the knowledge-extraction training set.
        https://www.kaggle.com/terrychanorg/albert-bilstm-crf-pytorch/data
        """
        tkitFile.File().mkdir("../tdata/kg")
        kgjson_t = tkitFile.Json("../tdata/kg/train.json")
        kgjson_d = tkitFile.Json("../tdata/kg/dev.json")

        self.tdb.load("kg_mark_unique_data")
        data = []
        for k, v in self.tdb.get_all():
            try:
                it = self.tdb.str_dict(v)
                label = ['O'] * len(it['sentence'])
                s = 0
                for one in it['kgs']:
                    # Mark the object (one[2]) of each triple as "描述" (description).
                    label, s1 = self.mark_word_label(it['sentence'], label,
                                                     one[2], "描述")
                    if s1 >= 0:
                        s = s + 1
                # Note: `one` here is the *last* triple from the loop above;
                # its subject and predicate are prepended to the sentence.
                d = {
                    'text':
                    list(one[0] + '#' + one[1] + '#' + it['sentence']),
                    'label': ['K'] * len(one[0]) + ['X'] +
                    ['P'] * len(one[1]) + ['X'] + label
                }
                if s > 0:
                    data.append(d)
            except Exception:
                # Skip records that cannot be parsed or marked.
                continue
        # 85/15 train/dev split.
        c = int(len(data) * 0.85)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        print("total records generated", len(data))
        # Automatically remove duplicate annotations.
        self.json_remove_duplicates("../tdata/kg/train.json")
        self.json_remove_duplicates("../tdata/kg/dev.json")
        print("data exported to ../tdata/kg")
Code example #4
import tkitFile


def get_data(path, tokenizer):
    """Yield padded keyword/text id pairs from a JSON dataset."""
    for it in tkitFile.Json(path).auto_load():
        item = {}

        # Encode the keywords, then right-pad to the tokenizer's max length.
        # (tokenizer.max_len is the attribute name in older transformers releases.)
        kw = tokenizer.encode_plus(it['keywords'],
                                   max_length=tokenizer.max_len,
                                   add_special_tokens=True)
        pad_num = tokenizer.max_len - len(kw['input_ids'])
        item['keywords'] = kw['input_ids'] + [
            tokenizer.convert_tokens_to_ids('[PAD]')
        ] * pad_num

        # Same for the body text.
        tx = tokenizer.encode_plus(it['text'],
                                   max_length=tokenizer.max_len,
                                   add_special_tokens=True)
        pad_num = tokenizer.max_len - len(tx['input_ids'])
        item['text'] = tx['input_ids'] + [
            tokenizer.convert_tokens_to_ids('[PAD]')
        ] * pad_num

        yield item
Code example #5
File: glue.py  Project: napoler/albert_pytorch
    def make_labels(self):
        tjson = tkitFile.Json(file_path=self.data_dir +
                              "/labels.json").auto_load()
        labels = []
        for item in tjson:
            labels.append(str(item['label']))
        self.labels = labels
Code example #6
File: glue.py  Project: napoler/albert_pytorch
    def get_dev_examples(self, data_dir):
        """See base class."""
        self.data_dir = data_dir
        file_path = os.path.join(self.data_dir, "dev.json")
        tjson = tkitFile.Json(file_path=file_path).auto_load()
        return self._create_examples(tjson, 'dev')
Code example #7
def get_keys(data_path=""):
    tjson=tkitFile.Json(file_path=data_path)
    keys=[]
    for it in tjson.auto_load():
        key=tkitText.Text().md5(it['sentence'])
        keys.append(key)
    return list(set(keys))
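
A typical use is skipping records whose sentence hash is already in an existing dataset (a sketch; new_items is a placeholder for incoming records):

tt = tkitText.Text()
known = set(get_keys("data/train.json"))
fresh = [it for it in new_items if tt.md5(it['sentence']) not in known]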
Code example #8
def save_loss(loss,name="default"):
    """
    保存loss 方便以后绘图分析
    """
    file_path="dataset/"+name+".json"
    tjosn=tkitFile.Json(file_path=file_path)
    one={"time":time.time(), 'loss':loss }
    tjosn.save([one])
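
A minimal sketch of calling this from a training loop (the loop and loss value below are illustrative; judging by the batched saves in example #21, tkitFile.Json.save appears to append, so each call adds one timestamped record):

import random

for step in range(1000):
    loss = random.random()  # stand-in for a real training loss
    if step % 100 == 0:
        save_loss(loss, name="run1")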
Code example #9
def save_data(data, path='data/', name="train.json"):
    """
    保存数据
    """
    tkitFile.File().mkdir(path)
    data_path = path + name
    tjson = tkitFile.Json(file_path=data_path)
    tjson.save(data)
Code example #10
    def save_to_json(self):
        """
        Can be used to test whether a knowledge triple is plausible.
        """
        kgjson_t = tkitFile.Json("../tdata/kg_check/train.json")
        kgjson_d = tkitFile.Json("../tdata/kg_check/dev.json")
        kgjson_l = tkitFile.Json("../tdata/kg_check/labels.json")
        data = []
        n = 0
        self.tdb.load("kg_mark")
        tt = tkitText.Text()
        i = -1
        for k, v in self.tdb.get_all():
            i = i + 1
            if v is None:
                n += 1
            else:
                try:
                    it = self.tdb.str_dict(v)
                    one = {}
                    # Prefix the sentence with its triple, wrapped in [kg] markers.
                    one['sentence'] = " [kg] " + ",".join(
                        it['kg']) + " [/kg] " + it['sentence']
                    one['label'] = it['label'] - 1

                    # Keep only checked records in state '2' with a full
                    # triple and a binary label.
                    if int(one['label']) in [0, 1] and len(
                            it['kg']) == 3 and it.get(
                                'check') is not None and it.get('state') == '2':
                        data.append(one)
                except Exception:
                    continue
        # 85/15 train/dev split.
        c = int(len(data) * 0.85)
        print("total records", len(data), i, n)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        # Automatically remove duplicate annotations.
        self.json_remove_duplicates("../tdata/kg_check/train.json")
        self.json_remove_duplicates("../tdata/kg_check/dev.json")
        print("data exported to ../tdata/kg_check")
Code example #11
def add_data(data,path='data/',name="data.json"):
    """
    添加数据样本
    data=[{"keywords": "哈士奇,主人,嚎叫,便是,说明,思考,没有,犬种,原因,新手,", "content": "新手养狗,哈是无忧无的经验和耐心。"}]

    """
    tkitFile.File().mkdir(path)
    data_path=path+name
    tjson=tkitFile.Json(file_path=data_path)

    tjson.save(data)
    return   tjson.auto_load()
Code example #12
def save_collection(collection_name, DB):
    """
    Dump one collection from a MongoDB database to a JSON file.
    """
    path = os.path.join("data", DB.name)
    tkitFile.File().mkdir(path)
    json_save = os.path.join("data", DB.name, collection_name + ".json")
    json_backup = tkitFile.Json(json_save)
    for it in DB[collection_name].find():
        try:
            json_backup.save([it])
        except Exception:
            # Records that fail to serialize are reported and skipped.
            print("error")
            print(it)
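
A hedged usage sketch with pymongo (the database and collection names are placeholders):

from pymongo import MongoClient

DB = MongoClient()["mydb"]          # DB.name is then "mydb"
save_collection("kg_content", DB)   # writes data/mydb/kg_content.json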
Code example #13
File: readtest.py  Project: napoler/gpt2Write
def read():
    """
    Build next-sentence corpus.
    from=0   # starting article id
    limit=10 # number of articles to return
    >>> data_pre_train(from=0, limit=10)
    """
    i = 0
    n = 0
    data = []
    tt = tkitText.Text()
    data_json = tkitFile.Json(file_path='data/train.json')
    for it in data_json.auto_load():
        print(it)
Code example #14
File: bulid_data.py  Project: napoler/gpt2Write
def data_pre_train_file(path='./data/'):
    """
    Generate training samples.
    """
    tkitFile.File().mkdir(path)
    train_path = path + 'train.txt'
    task_path = path + 'task.json'
    data_path = path + 'data.json'
    tjson = tkitFile.Json(file_path=task_path)

    data_pre_train(data_path=data_path, train_path=train_path)
Code example #15
File: glue.py  Project: napoler/albert_pytorch
    def bulid_labels(self):
        """See base class. Build the label dictionary from the data."""
        file_path = os.path.join(self.data_dir, "all_50_schemas.json")
        data = tkitFile.Json(file_path=file_path).auto_load()
        labels = []
        for i, it in enumerate(data):
            labels.append(it['predicate'])
        # Deduplicate while preserving first-seen order.
        labels = {}.fromkeys(labels).keys()
        # Reserve 0 for the NULL label; real labels are numbered from 1
        # (note: NULL is an int, the other ids are strings, as in the original).
        labels_dict = {"NULL": 0}
        for i, it in enumerate(labels, 1):
            labels_dict[it] = str(i)
        return labels_dict
Code example #16
File: bulid_data.py  Project: napoler/gpt2Write
def data_pre_train_mongo_summary(data_path='data/data.json',
                                 train_path='data/train_db_Summary.txt'):
    """
    from=0   # starting article id
    limit=10 # number of articles to return
    >>> data_pre_train(from=0, limit=10)
    """
    f1 = open(train_path, 'w')
    i = 0
    tjson = tkitFile.Json(file_path=data_path)
    for item in tqdm(tjson.auto_load()):
        i = i + 1
        if i % 10000 == 0:
            print(i)
        args = {'item': item, 'f1': f1}
        add_one(args)
    f1.close()  # the original never closed the handle
Code example #17
def check_model():
    """
    Re-screen earlier training data with the current model.
    """
    tjson = tkitFile.Json(file_path="data/classifypet/train.json")
    a = 0
    b = 0
    data = []
    for it in tjson.auto_load():
        a = a + 1
        p = petclass.pre(it['sentence'])
        if p == it['label']:
            b = b + 1
        else:
            # Show the disagreement and let a human decide the final label.
            print(it['sentence'][:500])
            print(it['label'])
            mp = input("mismatch, enter correct label: ")
            it['label'] = int(mp)
        data.append(it)
        print("one", b, a, b / a)
    print(b, a, b / a)
    add_data(data, path='data/classifypet/', name="train_b.json")
Code example #18
    def save_to_json_SQuAD(self):
        """Export the marked triples as a SQuAD-style QA dataset."""
        tkitFile.File().mkdir("../tdata/SQuAD")
        kgjson_t = tkitFile.Json("../tdata/SQuAD/train.json")
        kgjson_d = tkitFile.Json("../tdata/SQuAD/dev.json")
        data = []
        for it in DB.kg_mark_unique_data.find():
            k = it['_id']
            # Group predicates by subject: ner[subject] -> [predicate, ...]
            ner = {}
            one_q = {
                "id": k + "_s",
                "context": it['sentence'],
                "qas": []
            }
            for one in it['kgs']:
                try:
                    if one[1] not in ner[one[0]]:
                        ner[one[0]].append(one[1])
                except KeyError:
                    ner[one[0]] = [one[1]]
            for nr in ner:
                # Reset per subject (the original accumulated answers
                # across subjects, so later questions repeated earlier answers).
                answers = []
                label = ['O'] * len(it['sentence'])
                for n in ner[nr]:
                    try:
                        # Locate each predicate span inside the sentence.
                        label, s1 = self.mark_word_label(it['sentence'],
                                                         label, n, "关系")
                        if s1 >= 0:
                            answers.append({
                                "answer_start": s1,
                                "text": n
                            })
                    except Exception:
                        pass
                if len(answers) > 0:
                    # One QA pair: the subject is the question,
                    # the predicate spans are the answers.
                    one_q['qas'].append({
                        "question": nr,
                        'id': k + "_ner_rel_" + nr,
                        'answers': answers
                    })
            if len(one_q['qas']) > 0:
                one_kg = {
                    'paragraphs': [one_q],
                    'id': k + "_kg",
                    'title': it['sentence'][:10]
                }
                data.append(one_kg)

        # 85/15 train/dev split, wrapped in the SQuAD v1.0 envelope.
        c = int(len(data) * 0.85)
        t_data = {"version": "v1.0", "data": data[:c]}
        d_data = {"version": "v1.0", "data": data[c:]}
        kgjson_t.save([t_data])
        kgjson_d.save([d_data])
        print("total records generated", len(data))
        print("data exported to ../tdata/SQuAD")
Code example #19
# Note: the checkpoint albert-base-v2 is not fine-tuned for question answering.
# See examples/run_squad.py for how to fine-tune a model on a QA task.

import torch
from transformers import AlbertForQuestionAnswering, BertTokenizer

import tkitFile

tokenizer = BertTokenizer.from_pretrained('tkitfiles/qa/model/')
model = AlbertForQuestionAnswering.from_pretrained('tkitfiles/qa/model/')

data = tkitFile.Json("../tdata/SQuAD/dev.json")
i = 0
all = 0
f = 0
for item in data.auto_load():
    for one in item['data']:
        all = all + 1
        # Take the first question and its context from each paragraph.
        question, text = one['paragraphs'][0]['qas'][0]['question'], one[
            'paragraphs'][0]['context']

        input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
        # Older transformers versions return a (start_scores, end_scores) tuple.
        start_scores, end_scores = model(**input_dict)
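
To turn the raw scores into an answer string, the usual pattern is to take the argmax start and end positions and convert the ids back to tokens (a sketch, assuming the same older transformers API that returns plain score tensors):

# Decode the highest-scoring answer span from the logits.
input_ids = input_dict["input_ids"][0]
start = torch.argmax(start_scores)  # most likely start index
end = torch.argmax(end_scores) + 1  # exclusive end index
answer = tokenizer.convert_ids_to_tokens(input_ids[start:end])
print("".join(answer).replace("##", ""))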
Code example #20
"""
Split the marked data into training and dev sets.
"""
import tkitFile

tkitFile.File().mkdir("data/train")
mjson = tkitFile.Json("data/marked.json")
tjson = tkitFile.Json("data/train/train.json")
djson = tkitFile.Json("data/train/dev.json")
data = []
for item in mjson.auto_load():
    print(item)
    data.append(item)
# 80/20 train/dev split.
c = len(data) * 0.8
tjson.save(data[:int(c)])
djson.save(data[int(c):])
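
If data/marked.json is ordered (e.g. by source), shuffling before the split gives a more representative dev set; a small variant of the last lines:

import random

random.seed(42)        # make the split reproducible
random.shuffle(data)   # decorrelate the split from insertion order
c = int(len(data) * 0.8)
tjson.save(data[:c])
djson.save(data[c:])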
Code example #21
File: auto_find_title.py  Project: napoler/gpt2Write
from config import *
import tkitText, tkitFile
import time
from tqdm import tqdm

data_path = "/mnt/data/dev/github/scrapy/scrapy_baidu/scrapy_baidu/scrapy_baidu/data/all.json"
tjson = tkitFile.Json(file_path=data_path)


def save():
    """Dump DB.kg_content to JSON in batches of 10000 records."""
    its = []
    i = 0
    for it in DB.kg_content.find({}):
        i = i + 1
        its.append(it)
        if i % 10000 == 0:
            # Flush a batch; tkitFile.Json.save appends to the file.
            tjson.save(its)
            print(i)
            its = []
    tjson.save(its)  # flush the remainder


# save()
Code example #22
ner_reljson = tkitFile.Json("../tdata/kg/dev.json")
i = 0
all = 0
r = 0
limit = 1000
q = {'check': True, "state": '2', 'label': int(2)}
print('q', q)
for item in DB.kg_mark.find(q):
    # Input format: subject#predicate#sentence
    item['text'] = item['kg'][0] + '#' + item['kg'][1] + '#' + item['sentence']
    words_list, labels = get_Relationship_test(item)

    all = all + 1
    if len(words_list) > 0:
        # Count a hit when the gold object appears among the predictions.
        if item['kg'][2] in words_list:
            i = i + 1
Code example #23
    def save_to_json_kg_tmark(self):
        """
        Save the knowledge-extraction training set (tmark format).
        https://github.com/napoler/tmark_Description
        https://www.kaggle.com/terrychanorg/tmark-description
        """
        tkitFile.File().mkdir("../tdata/kg_tmark")
        kgjson_t = tkitFile.Json("../tdata/kg_tmark/train.json")
        kgjson_d = tkitFile.Json("../tdata/kg_tmark/dev.json")

        data = []
        for it in DB.kg_mark_unique_data.find():
            k = it['_id']
            try:
                label = ['O'] * len(it['sentence'])
                s = 0
                for one in it['kgs']:
                    # Mark predicate+object (one[1]+one[2]) as "描述" (description).
                    label, s1 = self.mark_word_label(it['sentence'], label,
                                                     one[1] + one[2], "描述")
                    if s1 >= 0:
                        s = s + 1
                # `one` is the last triple; its subject is prepended to the text.
                d = {'text': list(one[0]) + ['[SEP]'] + list(it['sentence']),
                     'label': ['实体'] * len(one[0]) + ['X'] + label}
                if len(d['text']) == len(d['label']):
                    # Pair each character with its tag: [(char, tag), ...]
                    one_kg_tmk = list(zip(d['text'], d['label']))
                    if s > 0:
                        data.append(one_kg_tmk)
            except Exception:
                continue
        print("total records generated", len(data))
        # 70/15/15 train/dev/test split.
        c = int(len(data) * 0.7)
        b = int(len(data) * 0.85)
        print(data[:10])
        print(len(data))
        self.save_data(data[:c], file="../tdata/kg_tmark/train.txt")
        self.save_data(data[c:b], file="../tdata/kg_tmark/dev.txt")
        self.save_data(data[b:], file="../tdata/kg_tmark/test.txt")
        self.save_labels(data, "../tdata/kg_tmark/labels.txt")
        print("data exported to ../tdata/kg_tmark")
Code example #24
    def save_to_json_ner(self):
        """Export an entity-only NER training set."""
        tkitFile.File().mkdir("../tdata/onlyner")
        kgjson_t = tkitFile.Json("../tdata/onlyner/train.json")
        kgjson_d = tkitFile.Json("../tdata/onlyner/dev.json")
        self.tdb.load("kg_mark_unique_data")
        data = []
        nlp_plus = tkitNlp.Plus()
        nlp_plus.load_tlp()
        flags = {}
        for it in DB.kg_mark_unique_data.find():
            k = it['_id']
            try:
                text = it['sentence']
                label = ["O"] * len(text)
                # Group predicates by subject: ner[subject] -> [predicate, ...]
                ner = {}
                for one in it['kgs']:
                    try:
                        if one[1] not in ner[one[0]]:
                            ner[one[0]].append(one[1])
                    except KeyError:
                        ner[one[0]] = [one[1]]
                # Merge annotated subjects with entities found by the NER tagger.
                ner_list = [tmp for tmp in ner.keys()]
                fner = []
                for word, flag in nlp_plus.ner(text):
                    flags[flag] = 0
                    fner.append(word)
                ner_list = list(set(ner_list + fner))
                # Mark shorter entities first so longer ones can overwrite them.
                ner_list = sorted(ner_list, key=lambda i: len(i), reverse=False)

                s = 0
                for nr in ner_list:
                    label, s1 = nlp_plus.mark_word_label(text, label, nr, "实体")
                    if s1 >= 0:
                        s = s + 1
                if s > 0:
                    one = {'text': list(text), 'label': label}
                    data.append(one)
            except Exception:
                pass
        nlp_plus.release()

        # 85/15 train/dev split.
        c = int(len(data) * 0.85)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        print("total records generated", len(data))
        # Automatically remove duplicate annotations.
        self.json_remove_duplicates("../tdata/onlyner/train.json")
        self.json_remove_duplicates("../tdata/onlyner/dev.json")
        print("data exported to ../tdata/onlyner")
Code example #25
    def read_kg(self):
        """Yield knowledge triples from the JSON file one at a time."""
        kgjson = tkitFile.Json("../data/knowledge_triple.json")
        for item in kgjson.auto_load():
            yield item
Code example #26
def build_dataset_ner(train_file,type="all"):
    """
    百度训练集
    转化为标注数据集
    实体标注和关系词抽取训练集
    train_file 文件路径
    type="all" 或者mini 
    mini

    构建数据思路
    多个描述合并到一个训练里

    使用ner提取出句子中的实体

    文本: ner+句子
    label: ['K']*len(ner)+正常标记
    """
    tjson=tkitFile.Json(file_path=train_file)
    # all_save=Tjson(file_path="data/train_all.json")
    # tjson_save=Tjson(file_path="data/ner_train.json")
    # dev_json_save=Tjson(file_path="data/ner_dev.json")
    tjson_save=tkitFile.Json(file_path="../tdata/onlyner/train.json")
    dev_json_save=tkitFile.Json(file_path="../tdata/onlyner/dev.json")
    data=[]
    nlp_plus=tkitNlp.Plus()
    nlp_plus.load_tlp()
    flags={}
    for item in tqdm(tjson.load()):
        text= item['text']
        label= ["O"]*len(text)
        ner={}
        for n in item['spo_list']:
            try:
                ner[n['subject']].append(n['predicate'])
            except:
                ner[n['subject']]=[n['predicate']]
        # for  tmp in ner.keys():
        #     print(tmp)
        ner_list =[tmp for  tmp in ner.keys() ]
        # print(ner_list)
        # fner =[word for word,flag in nlp_plus.ner(text)]
        fner=[]
        for word,flag in nlp_plus.ner(text):
            flags[flag]=0
            fner.append(word)
        ner_list=list(set(ner_list+fner))
        ner_list = sorted(ner_list,key = lambda i:len(i),reverse=False)
        # print(ner_list)
        s=0
        for nr in ner_list:
           
            # print(nr)
            label,s1=nlp_plus.mark_word_label(text,label,nr,"实体")
            if s1>=0:
                s=s+1
            # for n in ner[nr]:
            #     label,s1=mark_word_label(text,label,n,"实体")
            #     if s1>=0:
            #         s=s+1
        if s>0:
            one={'text':list(text),'label':label}
            data.append(one)
            # print(one)
            # print(flags)

    nlp_plus.release()
    if type=="all":
        pass
    elif type=="mini":
        data=data[:200]
    # all_save.save(data)
    print("总共数据",len(data))
    f=int(len(data)*0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
Code example #27
def data_pre_train_mongo_next_sentence():
    """
    Build a next-sentence corpus.
    from=0   # starting article id
    limit=10 # number of articles to return
    >>> data_pre_train(from=0, limit=10)
    """
    n = 0
    data = []
    parser = argparse.ArgumentParser()
    parser.add_argument('--limit',
                        default=50000,
                        type=int,
                        required=False,
                        help='limit on the number of pairs')
    args = parser.parse_args()

    tt = tkitText.Text()
    data_json = tkitFile.Json(file_path='data.json')
    for it in data_json.auto_load():
        sents = tt.sentence_segmentation_v1(it['content'])
        pre_sents = []
        for i, sent in enumerate(sents):
            if i == 0:
                # Positive pair: the title followed by the first sentence.
                data.append({'sentence': it['title'],
                             'sentence_b': sent,
                             'label': 1})
                # Negative pair: the title with a random other sentence.
                # Only appended when it actually differs; the original
                # re-appended the positive pair in that case.
                rand_sent = choice(sents)
                if rand_sent != sent:
                    data.append({'sentence': it['title'],
                                 'sentence_b': rand_sent,
                                 'label': 0})
                pre_sents.append(it['title'])
                pre_sents.append(sent)
            else:
                # Positive pair: last 200 chars of context, then the next sentence.
                pre_text = "".join(pre_sents)
                data.append({'sentence': pre_text[-200:],
                             'sentence_b': sent,
                             'label': 1})
                rand_sent = choice(sents)
                if rand_sent != sent:
                    data.append({'sentence': pre_text[-200:],
                                 'sentence_b': rand_sent,
                                 'label': 0})
                pre_sents.append(sent)
        if len(data) > args.limit:
            break
        n = n + 1
    # 80/20 train/dev split.
    cut = int(len(data) * 0.8)
    save_data(data[:cut], path='data/', name="train.json")
    save_data(data[cut:], path='data/', name="dev.json")
Code example #28
    def save_to_json_ner_rel(self):
        """Export a relation set: subject + sentence -> relation spans."""
        # The original created ../tdata/ner here, leaving ner_rel missing.
        tkitFile.File().mkdir("../tdata/ner_rel")
        kgjson_t = tkitFile.Json("../tdata/ner_rel/train.json")
        kgjson_d = tkitFile.Json("../tdata/ner_rel/dev.json")
        data = []
        for it in DB.kg_mark_unique_data.find():
            k = it['_id']
            try:
                # Group predicates by subject: ner[subject] -> [predicate, ...]
                ner = {}
                for one in it['kgs']:
                    try:
                        if one[1] not in ner[one[0]]:
                            ner[one[0]].append(one[1])
                    except KeyError:
                        ner[one[0]] = [one[1]]

                for nr in ner:
                    s = 0
                    label = ['O'] * len(it['sentence'])
                    for n in ner[nr]:
                        # Mark each predicate span as "关系" (relation).
                        label, s1 = self.mark_word_label(it['sentence'],
                                                         label, n, "关系")
                        if s1 >= 0:
                            s = s + 1
                    if s > 0:
                        # Input is "subject#sentence"; the subject chars are
                        # tagged 'K' and the separator 'X'.
                        one_ner = {'text': list(nr + '#' + it['sentence']),
                                   'label': ['K'] * len(nr) + ['X'] + label}
                        data.append(one_ner)
            except Exception:
                continue
        # 85/15 train/dev split.
        c = int(len(data) * 0.85)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        print("total records generated", len(data))
        # Automatically remove duplicate annotations.
        self.json_remove_duplicates("../tdata/ner_rel/train.json")
        self.json_remove_duplicates("../tdata/ner_rel/dev.json")
        print("data exported to ../tdata/ner_rel")
Code example #29
        elif flag.startswith("I-"):
            one.append(word)
        elif flag.startswith("E-"):
            one.append(word)
            words_list.append("".join(one))
        elif flag.startswith("S-"):
            words_list.append(word)
    # print(words_list)
    # return words_list,words, postags,netags
    return words_list
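
A quick check of the reconstructed decoder (illustrative tags):

print(decode_bioes(list("北京大学清华"),
                   ["B-ORG", "I-ORG", "I-ORG", "E-ORG", "B-ORG", "I-ORG"]))
# -> ['北京大学']; the trailing B-/I- pair never reaches an E- tag, so it is dropped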


import tkitFile
tfile = tkitFile.File()

tj = tkitFile.Json(file_path='data/ner/dev.json')
i = 0
n = 0
f = 0
good = 0
all = 0
for item in tj.auto_load():
    print("###" * 20)
    # Gold entities from the annotation vs. the model's prediction.
    o_ners = Ner_Marker.get_mark_data(item).get("实体")
    text = ''.join(item['text'])
    result = Ner_Marker.pre_ner(text)
    all = all + 1
    if o_ners == result:
        good = good + 1
Code example #30
    def save_to_json(self):
        """
        Can be used to test whether a knowledge triple is plausible.
        """
        kgjson_t = tkitFile.Json("../tdata/kg_check/train.json")
        kgjson_d = tkitFile.Json("../tdata/kg_check/dev.json")
        data = []
        n = 0
        tt = tkitText.Text()
        i = -1
        q = {'check': True, 'state': '2'}
        for it in DB.kg_mark.find(q):
            k = it["_id"]
            i = i + 1
            n = n + 1
            try:
                one = {}
                one['sentence'] = it['sentence']
                one['sentence_b'] = ",".join(it['kg'])
                one['label'] = it['label'] - 1

                if int(one['label']) in [0, 1] and len(it['kg']) == 3 \
                        and it.get('check') is not None and it.get('state') == '2':
                    data.append(one)

                    # Build a synthetic negative by corrupting the object
                    # (index 2) of the triple with a random clip of itself.
                    for j, sentence in enumerate(it['kg']):
                        if j != 2:
                            continue
                        new = self.random_text_clip(sentence)
                        if new not in it['kg']:
                            # Copy the triple; it.copy() is shallow, so the
                            # original would mutate the source record.
                            new_kg = list(it['kg'])
                            new_kg[j] = new
                            data.append({
                                'sentence': it['sentence'],
                                'sentence_b': ",".join(new_kg),
                                'label': 0
                            })
                else:
                    print(it)
            except Exception:
                continue
        # 85/15 train/dev split.
        c = int(len(data) * 0.85)
        print("total records", len(data), i, n)
        kgjson_t.save(data[:c])
        kgjson_d.save(data[c:])
        # Automatically remove duplicate annotations.
        self.json_remove_duplicates("../tdata/kg_check/train.json")
        self.json_remove_duplicates("../tdata/kg_check/dev.json")
        print("data exported to ../tdata/kg_check")