def add_sentence_one(text):
    """Turn one ``##del##``-annotated sentence into per-token training records.

    Every span written as ``##del##word##del##`` in *text* is collected as a
    marked keyword.  The markers are then stripped, the plain sentence is
    tokenised with ``jieba_seg_list`` and, for every token position, two
    records are handed to ``libs.Terry().c_inputfile``:

    * ``train.json``    - the sentence with that token wrapped in ``##del##``
    * ``train_v2.json`` - the sentence with that token replaced by ``###``

    A record is labelled ``'Yes'`` when its token equals a marked keyword
    that has not been consumed yet, otherwise ``'No'``.

    NOTE(review): a later ``def add_sentence_one`` in this file rebinds the
    same name, so this implementation looks shadowed - confirm which one is
    meant to be live.

    Args:
        text: Sentence that may contain ``##del##...##del##`` markers.
    """
    data_dir = '/home/terry/pan/github/Bert-Sentence-streamlining/Bert-Sentence-streamlining/data/'
    train_path = data_dir + 'train.json'
    train_v2_path = data_dir + 'train_v2.json'

    print(text)
    # Match the literal marker pair.  The previous pattern put the marker
    # inside [...], which is a character class (any single '#', 'd', 'e' or
    # 'l'), so text containing those characters yielded bogus keywords.
    p = re.compile(r'##del##(.*?)##del##', re.S)
    words = re.findall(p, text)
    # presumably filters out empty matches - helper is defined elsewhere
    words = remove_null(words)

    text = text.replace("##del##", "")
    seg_list = jieba_seg_list(text)
    print(seg_list)

    for i, item in enumerate(seg_list):
        print(item)
        calculate = 'No'
        # Drain every pending copy of this keyword, so only the first
        # matching token in the sentence is labelled 'Yes'.
        while item in words:
            words.remove(item)
            calculate = 'Yes'

        # Copies of the single segmentation replace re-running the
        # tokenizer twice for every token.
        s = list(seg_list)
        s2 = list(seg_list)  # second variant: token masked instead of wrapped
        s[i] = "##del##" + item + "##del##"
        s2[i] = "###"

        data = {
            'label': calculate,
            'sentence': ''.join(s),
        }
        print(data)
        # Labelled and unlabelled records both go to the same file.
        libs.Terry().c_inputfile(train_path, data)

        data2 = {
            'label': calculate,
            'sentence': ''.join(s2),
        }
        # Log the record that is actually written (was printing `data` again).
        print(data2)
        libs.Terry().c_inputfile(train_v2_path, data2)
def random_sentence_one():
    """Generate one random unlabelled sample and hand it to ``corpus.json``.

    A random file is chosen from the kw2text corpus directory, its text is
    split with ``tkit.Text().sentence_segmentation`` and one sentence is
    picked at random.  When that sentence has more than three tokens, one
    random token is wrapped in ``##del##`` markers and the result is passed
    to ``libs.Terry().c_inputfile('corpus.json', ...)`` with label ``'No'``.

    Nothing is written when the directory lists no files, the chosen file
    yields no sentences, or the sentence is too short.
    """
    tfile = tkit.File()
    t_text = tkit.Text()
    file_path = "/home/terry/pan/github/ai_writer/ai_writer/data/kw2text/"

    # Fetch the unprocessed corpus files.
    file_list = tfile.file_List(file_path)
    if not file_list:
        # choice() raises IndexError on an empty sequence.
        return
    text = tfile.open_file(choice(file_list))

    text_array = t_text.sentence_segmentation(text)
    if not text_array:
        return
    sentence_one = choice(text_array)

    # Tokenise the sentence; sentences that are too short are skipped.
    seg_list = jieba_seg_list(sentence_one)
    seg_list_len = len(seg_list)
    if seg_list_len <= 3:
        return

    # Wrap one randomly chosen token in the marker pair.
    n = random.randint(0, seg_list_len - 1)
    seg_list[n] = '##del##' + seg_list[n] + '##del##'
    data = {
        'label': 'No',
        'sentence': ''.join(seg_list),
    }
    print(data)
    libs.Terry().c_inputfile('corpus.json', data)
def add_sentence_one(text):
    """Store the training records derived from one annotated sentence.

    Spans written as ``##del##word##del##`` in *text* are collected as marked
    keywords and passed, together with the original text, to
    ``replace_mark``.  Every item of the returned ``'mark'`` list followed by
    every item of the ``'unmark'`` list is handed to
    ``libs.Terry().c_inputfile`` for the ``mark/train.json`` file.

    NOTE(review): an earlier ``def add_sentence_one`` exists in this file;
    this later definition rebinds the name - confirm that is intended.

    Args:
        text: Sentence that may contain ``##del##...##del##`` markers.
    """
    print(text)
    # Match the literal marker pair.  The previous pattern put the marker
    # inside [...], which is a character class (any single '#', 'd', 'e' or
    # 'l'), so text containing those characters yielded bogus keywords.
    p = re.compile(r'##del##(.*?)##del##', re.S)
    words = re.findall(p, text)
    # presumably filters out empty matches - helper is defined elsewhere
    words = remove_null(words)

    ls = replace_mark(words, text)
    # Marked records first, then all unmarked ones (no down-sampling).
    for item in ls['mark'] + ls['unmark']:
        libs.Terry().c_inputfile('/home/terry/github/ai_writer/ai_writer/data/mark/train.json', item)
def add_sentence_one_unmark(text):
    """Write ``'No'``-labelled records for randomly chosen unmarked tokens.

    Spans written as ``##del##word##del##`` in *text* are collected as marked
    keywords.  The markers are stripped, the sentence is tokenised with
    ``jieba_seg_list`` and, for each marked keyword, the first matching token
    position is excluded.  From the remaining positions as many are sampled
    as there are marked keywords (fewer if not enough remain).  For every
    sampled position two records labelled ``'No'`` are handed to
    ``libs.Terry().c_inputfile``:

    * ``mark/train.json``    - the sentence with that token wrapped in
      ``##del##``
    * ``mark/train_v2.json`` - the sentence with that token replaced by
      ``###``

    Args:
        text: Sentence that may contain ``##del##...##del##`` markers.
    """
    data_dir = '/home/terry/github/ai_writer/ai_writer/data/mark/'
    train_path = data_dir + 'train.json'
    train_v2_path = data_dir + 'train_v2.json'

    print(text)
    # Match the literal marker pair.  The previous pattern put the marker
    # inside [...], which is a character class (any single '#', 'd', 'e' or
    # 'l'), so text containing those characters yielded bogus keywords.
    p = re.compile(r'##del##(.*?)##del##', re.S)
    words = re.findall(p, text)
    # presumably filters out empty matches - helper is defined elsewhere
    words = remove_null(words)

    text = text.replace("##del##", "")
    seg_list = jieba_seg_list(text)
    print(seg_list)
    print('words', words)

    # Track candidate token *positions* rather than mutating seg_list.  The
    # old code aliased seg_list, so removing marked words shifted the loop
    # index away from the token it was supposed to mark.
    unmarked_idx = list(range(len(seg_list)))
    for word in words:
        hit = next((k for k in unmarked_idx if seg_list[k] == word), None)
        if hit is None:
            # The marked span does not line up with any token.
            print('无法移除', word)
        else:
            unmarked_idx.remove(hit)

    # Draw as many negatives as there are marked keywords, capped so that
    # sample() cannot raise ValueError on a short sentence.
    sample_size = min(len(words), len(unmarked_idx))
    picked = sample(unmarked_idx, sample_size)
    print('seg_list_unmark_mini', [seg_list[k] for k in picked])

    # Emit records in sentence order.
    for i in sorted(picked):
        item = seg_list[i]
        s = list(seg_list)
        s2 = list(seg_list)  # second variant: token masked instead of wrapped
        s[i] = "##del##" + item + "##del##"
        s2[i] = "###"

        data = {
            'label': 'No',
            'sentence': ''.join(s),
        }
        data2 = {
            'label': 'No',
            'sentence': ''.join(s2),
        }
        libs.Terry().c_inputfile(train_path, data)
        libs.Terry().c_inputfile(train_v2_path, data2)