Code Example #1
def add_sentence_one(text):
    """添加一条数据"""
    tfile =  tkit.File()
    t_text=tkit.Text()
    print(text)

    # match text wrapped between ##del## markers
    p = re.compile(r'##del##(.*?)##del##', re.S)
    words = re.findall(p, text)
    # keywords that were marked in the text
    words = remove_null(words)
    # strip the markers to recover the plain sentence
    text = text.replace("##del##", "")

    seg_list = jieba_seg_list(text)
    print(seg_list)
    for i, item in enumerate(seg_list):
        print(item)
        s = jieba_seg_list(text)
        s2 = jieba_seg_list(text)  # second variant of the sentence
        calculate = 'No'
        # if this token was one of the marked keywords, label it 'Yes'
        while item in words:
            words.remove(item)
            calculate = 'Yes'
        s[i] = "##del##" + item + "##del##"
        s2[i] = "###"  # second variant: replace the token instead of wrapping it
        data = {
            'label': calculate,
            'sentence': ''.join(s)
        }
        print(data)
        if calculate == 'No':
            # unlabeled data
            libs.Terry().c_inputfile('/home/terry/pan/github/Bert-Sentence-streamlining/Bert-Sentence-streamlining/data/train.json', data)
        else:
            libs.Terry().c_inputfile('/home/terry/pan/github/Bert-Sentence-streamlining/Bert-Sentence-streamlining/data/train.json', data)

        data2 = {
            'label': calculate,
            'sentence': ''.join(s2)
        }
        print(data2)
        libs.Terry().c_inputfile('/home/terry/pan/github/Bert-Sentence-streamlining/Bert-Sentence-streamlining/data/train_v2.json', data2)
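The examples in this file rely on two project helpers that are not shown, jieba_seg_list and remove_null, plus module-level imports. A minimal sketch of what they might look like, assuming jieba_seg_list is a thin wrapper around jieba and remove_null simply drops empty matches (these are assumptions inferred from usage, not the project's actual implementation):

# Assumed module-level imports and helper sketches; inferred from usage, not shown in fun.py.
import re
import random
from random import choice, sample

import jieba  # Chinese word segmentation library


def jieba_seg_list(text):
    """Hypothetical wrapper: segment text into a list of tokens with jieba."""
    return jieba.lcut(text)


def remove_null(items):
    """Hypothetical helper: drop empty or whitespace-only regex matches."""
    return [item for item in items if item and item.strip()]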
Code Example #2
File: fun.py Project: newszeng/ai_writer
def random_sentence_one():
    """
    随机生成一条未标记数据
    """
    # 获取未处理的数据
    tfile =  tkit.File()
    t_text=tkit.Text()
    # file_path="/home/terry/pan/github/ai_writer/ai_writer/data/kw2text_mini/"
    file_path="/home/terry/pan/github/ai_writer/ai_writer/data/kw2text/"
    file_list=tfile.file_List(file_path)
    f = choice(file_list)
    text = tfile.open_file(f)

    text_array = t_text.sentence_segmentation(text)
    sentence_one = choice(text_array)
    # segment the sentence into words
    seg_list = jieba_seg_list(sentence_one)

    seg_list_len = len(seg_list)
    # ignore sentences that are too short
    if seg_list_len > 3:
        # pick a random token index
        n = random.randint(0, seg_list_len - 1)

        # wrap the chosen token in ##del## markers
        seg_list[n] = '##del##' + seg_list[n] + '##del##'
        full = ''.join(seg_list)
        calculate = 'No'

        data = {
            'label': calculate,
            'sentence': full
        }
        print(data)
        libs.Terry().c_inputfile('corpus.json',data)
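For clarity, here is the marker-wrapping step above in isolation; the token list is a made-up illustration standing in for the output of jieba_seg_list, not project data:

# Standalone illustration of the ##del## marker step (hypothetical tokens).
import random

seg_list = ['今天', '天气', '真的', '很', '不错']  # pretend output of jieba_seg_list
n = random.randint(0, len(seg_list) - 1)

seg_list[n] = '##del##' + seg_list[n] + '##del##'
data = {'label': 'No', 'sentence': ''.join(seg_list)}
print(data)
# e.g. {'label': 'No', 'sentence': '今天天气##del##真的##del##很不错'}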
Code Example #3
File: fun.py Project: newszeng/ai_writer
def add_sentence_one(text):
    """添加一条数据"""
    tfile =  tkit.File()
    t_text=tkit.Text()
    print(text)

    # match text wrapped between ##del## markers
    p = re.compile(r'##del##(.*?)##del##', re.S)
    words = re.findall(p, text)
    # keywords that were marked in the text
    words = remove_null(words)
    ls = replace_mark(words, text)
    mark_list = ls['mark']
    # optionally downsample to balance: unmark_list = sample(ls['unmark'], len(ls['mark']))
    unmark_list = ls['unmark']
    new_list = mark_list + unmark_list
    for item in new_list:
        libs.Terry().c_inputfile('/home/terry/github/ai_writer/ai_writer/data/mark/train.json', item)
Code Example #4
File: fun.py Project: newszeng/ai_writer
def add_sentence_one_unmark(text):
    """添加一条未标记的数据"""
    tfile =  tkit.File()
    t_text=tkit.Text()
    print(text)

    # match text wrapped between ##del## markers
    p = re.compile(r'##del##(.*?)##del##', re.S)
    words = re.findall(p, text)
    # keywords that were marked in the text
    words = remove_null(words)
    # strip the markers to recover the plain sentence
    text = text.replace("##del##", "")

    seg_list = jieba_seg_list(text)
    print(seg_list)
    # copy so that removals below do not mutate seg_list
    seg_list_unmark = list(seg_list)
    print('words', words)
    for word in words:
        try:
            seg_list_unmark.remove(word)
        except ValueError:
            print('could not remove', word)

    # randomly draw as many unmarked tokens as there are marked keywords
    seg_list_unmark_mini = sample(seg_list_unmark, len(words))
    print('seg_list_unmark_mini', seg_list_unmark_mini)
    for i, item in enumerate(seg_list):
        s = jieba_seg_list(text)
        s2 = jieba_seg_list(text)  # second variant of the sentence
        calculate = 'ignore'

        # tokens drawn as unlabeled negatives get the label 'No'
        while item in seg_list_unmark_mini:
            seg_list_unmark_mini.remove(item)
            calculate = 'No'

        s[i] = "##del##" + item + "##del##"
        s2[i] = "###"  # second variant: replace the token instead of wrapping it
        data = {
            'label': calculate,
            'sentence': ''.join(s)
        }
        data2 = {
            'label': calculate,
            'sentence': ''.join(s2)
        }
        if calculate == 'ignore':
            # token was neither a marked keyword nor a sampled negative; skip it
            pass
        else:
            libs.Terry().c_inputfile('/home/terry/github/ai_writer/ai_writer/data/mark/train.json', data)
            libs.Terry().c_inputfile('/home/terry/github/ai_writer/ai_writer/data/mark/train_v2.json', data2)
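All four examples persist records through libs.Terry().c_inputfile, whose implementation is not shown. A minimal stand-in, assuming it appends one JSON object per line to the target file (an assumption based purely on how it is called in these examples):

# Hypothetical stand-in for libs.Terry().c_inputfile; assumes JSON-lines append semantics.
import json


def c_inputfile(path, record):
    """Append one record as a JSON line to the given file."""
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')


# usage: c_inputfile('corpus.json', {'label': 'No', 'sentence': 'text with ##del##markers##del##'})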