Code example #1
import random

from utils import split_by_fullstop


def get_anocontext_randmid(x):
    # Wrap a random middle sentence of story x in 【】 markers.
    s = split_by_fullstop(x)
    # Need at least one sentence strictly between the first and the last.
    if len(s) <= 2:
        return False
    sample_index = random.randint(1, len(s) - 2)
    s[sample_index] = '【' + s[sample_index] + '】'
    return ''.join(s)
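
Every example in this section leans on a split_by_fullstop helper from utils whose source is not included. A minimal sketch of what it plausibly does, inferred from usage: the pieces must re-join into the original story via ''.join, so the punctuation has to stay attached, and several callers filter out empty pieces afterwards. The exact punctuation set is an assumption.

import re

def split_by_fullstop(text):
    # Hypothetical reconstruction: cut after sentence-final punctuation,
    # keeping each delimiter attached so ''.join(...) reproduces text.
    # A trailing empty string may appear, which callers drop via len(x) > 0.
    return re.split(r'(?<=[。！？])', text)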
Code example #2
from utils import split_by_fullstop


def get_anocontext_end(x):
    # Wrap the last sentence of x.story in 【】 markers, skipping rows
    # already flagged as having a high-BLEU ending.
    if x.high_end_bleu:
        return False
    s = split_by_fullstop(x.story)
    s[-1] = '【' + s[-1] + '】'
    return ''.join(s)
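
Both helpers look like they are meant to be applied row-wise over a story DataFrame. A hypothetical usage sketch: the high_end_bleu flag and its 0.5 threshold are assumptions, with ending_bleu taken from the file written in code example #8.

import pandas as pd

df = pd.read_csv('33-get-ending-bleu.csv')
df['high_end_bleu'] = df['ending_bleu'] > 0.5  # assumed threshold
df['anno_mid'] = df['story'].apply(get_anocontext_randmid)
df['anno_end'] = df.apply(get_anocontext_end, axis=1)  # rows have .story / .high_end_bleu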
Code example #3
import copy
import random

import pandas as pd
import torch
from bert_score import BERTScorer
from utils import split_by_fullstop


def gen_samples():
    scorer = BERTScorer(lang="zh", rescale_with_baseline=True)
    data_file = '32-deduplicate-story.csv'
    df = pd.read_csv(data_file)
    stories = list(df.story.dropna())
    stories_split = [split_by_fullstop(x) for x in stories]
    # Pick one random sentence index per story.
    stories_split_select = [
        random.randint(0, len(x) - 1) for x in stories_split
    ]
    stories_sentencesample = [
        x[y] for x, y in zip(stories_split, stories_split_select)
    ]
    # Deep-copy before masking so stories_split stays intact for the
    # reference pool below.
    stories_split_copy = copy.deepcopy(stories_split)
    stories_context = []
    for ss, sss in zip(stories_split_copy, stories_split_select):
        ss[sss] = '<MASK>'
        stories_context.append(ss)
    stories_context = [''.join(x) for x in stories_context]
    # Positive sample: each masked context paired with its true sentence.
    positive_samples = [
        (x, y, True) for x, y in zip(stories_context, stories_sentencesample)
    ]
    cands = stories_sentencesample
    assert len(cands) == len(stories_split)
    # For each candidate, the references are every non-empty sentence
    # from every *other* story.
    refs = []
    for i, cand in enumerate(cands):
        refs.append([
            x for j, y in enumerate(stories_split) for x in y
            if len(x) > 0 and j != i
        ])
    bestmatch = []
    print(len(cands))
    for i, (c, ref) in enumerate(zip(cands, refs)):
        print(f'{i}th candidate...')
        # Score the candidate against every reference sentence in one batch.
        cand = [c] * len(ref)
        P, R, F1 = scorer.score(cand, ref)
        # The reference with the highest recall is the closest match,
        # i.e. the hardest negative.
        bestmatch.append(int(torch.argmax(R)))
    # Negative sample: each masked context with its hardest negative.
    negative_samples = [(x, y[z], False)
                        for x, y, z in zip(stories_context, refs, bestmatch)]
    # Return (context, true sentence, hardest negative) triples.
    return [(x, w, y[z]) for x, y, z, w in zip(
        stories_context, refs, bestmatch, stories_sentencesample)]
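
A hedged sketch of how the returned triples might be flattened into labeled rows; the output path and column names here are assumptions, not from the original:

import pandas as pd

triples = gen_samples()
rows = []
for context, true_sent, neg_sent in triples:
    rows.append({'context': context, 'candidate': true_sent, 'label': True})
    rows.append({'context': context, 'candidate': neg_sent, 'label': False})
pd.DataFrame(rows).to_csv('gen_samples_output.csv', index=False)  # assumed path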
Code example #4
import random

import pandas as pd
from utils import split_by_fullstop


def gen_PC():
    # Annotated valid/test pool: masked contexts with their true sentences.
    vt = pd.read_csv('91-adjust_bias.csv')
    vt = vt[['context', 'True']]
    # Reconstruct the full story by filling the mask back in.
    vt['story'] = vt.apply(
        lambda x: x['context'].replace('<MASK>', x['True']), axis=1)
    df = pd.read_csv('32-deduplicate-story.csv')
    # Training pool: mask every sentence of every story in turn.
    dict_sct = []
    for story in df.story:
        for s in split_by_fullstop(story):
            dict_sct.append({
                'story': story,
                'context': story.replace(s, '<MASK>'),
                'True': s,
            })
    train = pd.DataFrame(dict_sct)
    # Keep train disjoint from the annotated valid/test stories.
    train = train[~train.story.isin(vt['story'].tolist())]
    train[['context', 'True']].to_csv('99-dataset/PC/PC_train.csv')
    # Split the annotated stories 50/50 into valid and test, by story.
    stories_in_vt = sorted(set(vt['story'].tolist()))
    random.shuffle(stories_in_vt)
    half = int(len(stories_in_vt) * 0.5)
    stories_in_valid = stories_in_vt[:half]
    stories_in_test = stories_in_vt[half:]
    valid = vt[vt.story.isin(stories_in_valid)]
    test = vt[vt.story.isin(stories_in_test)]
    valid.to_csv('99-dataset/PC/PC_valid.csv')
    test.to_csv('99-dataset/PC/PC_test.csv')
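
A quick sanity check one might run after gen_PC(); this is an assumption, not part of the original script:

import pandas as pd

train = pd.read_csv('99-dataset/PC/PC_train.csv')
valid = pd.read_csv('99-dataset/PC/PC_valid.csv')
test = pd.read_csv('99-dataset/PC/PC_test.csv')

def story_set(df):
    # Rebuild each story by filling the mask with its true sentence.
    return set(df.apply(
        lambda x: x['context'].replace('<MASK>', x['True']), axis=1))

assert story_set(train).isdisjoint(story_set(valid) | story_set(test))
assert story_set(valid).isdisjoint(story_set(test))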
Code example #5
from utils import split_by_fullstop

def get_sentence_len(s):
    return len(split_by_fullstop(s))  # number of sentences in story s
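
A possible use, filtering the deduplicated stories by sentence count; the column name and minimum length are assumptions:

import pandas as pd

df = pd.read_csv('32-deduplicate-story.csv')
df['n_sentences'] = df['story'].apply(get_sentence_len)
df = df[df['n_sentences'] >= 3]  # assumed minimum length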
Code example #6
from bert_score import BERTScorer
import random
import pandas as pd
from utils import split_by_fullstop
from tools import start_debugger_on_exception
import torch

start_debugger_on_exception()
scorer = BERTScorer(lang="zh", rescale_with_baseline=True)
data_file = 'annotateddata/batch1.csv'
df = pd.read_csv(data_file)
stories = list(df.RESULT.dropna())
stories_split = [split_by_fullstop(x) for x in stories]
# Flat pool of every non-empty sentence, built *before* the masking below
# (which mutates stories_split in place).
refs_pre = [x for y in stories_split for x in y if len(x) > 0]
# One random sentence index per story.
stories_split_select = [random.randint(0, len(x) - 1) for x in stories_split]
stories_sentencesample = [x[y] for x, y in zip(stories_split, stories_split_select)]

stories_context = []
for ss, sss in zip(stories_split, stories_split_select):
    ss[sss] = '<MASK>'  # NOTE: mutates stories_split in place
    stories_context.append(ss)
stories_context = [''.join(x) for x in stories_context]
positive_samples = [(x, y, True) for x, y in zip(stories_context, stories_sentencesample)]
cands_pre = stories_sentencesample
len_refs = len(refs_pre)
len_cands = len(cands_pre)
# Full cross product: repeat each candidate len_refs times and tile the
# reference pool len_cands times, so cands[i] pairs with refs[i].
cands = [x for x in cands_pre for i in range(len_refs)]
refs = refs_pre * len_cands
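
The script stops before scoring; presumably the flattened pairs are then scored in one batched call and reshaped into a candidates × references matrix. A hedged sketch of that next step, mirroring code example #3 (everything below is an assumption):

P, R, F1 = scorer.score(cands, refs)  # one batched call; may be very large
R_matrix = R.reshape(len_cands, len_refs)
# Hardest negative per candidate: the best-matching reference sentence.
bestmatch = torch.argmax(R_matrix, dim=1)
negative_samples = [(ctx, refs_pre[int(j)], False)
                    for ctx, j in zip(stories_context, bestmatch)]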
Code example #7
import pandas as pd
from utils import split_by_pair, split_by_fullstop

df = pd.read_csv('annotateddata/05-14anticomm-top500.csv')

# Annotation columns (in order): title, given story, REJECT, the sentence
# that can be inferred by commonsense, a rewrite of the marked sentence that
# fits the context but violates commonsense, and the rewriting strategy.
COLUMNS = '标题,给定的故事,REJECT,找出其中可以使用常识推断的句子,将标出的句子改写为一个与上下文内容相关但违反常识的句子,改写策略'.split(',')

rest = df.iloc[498:]
rest = rest[COLUMNS]
# Split each story into (before, marked sentence, after) around the 【】 pair.
rest['temp'] = rest['给定的故事'].apply(lambda x: split_by_pair(x, left='【', right='】'))
rest['temp0'] = rest['temp'].apply(lambda x: split_by_fullstop(x[0]))
rest['temp2'] = rest['temp'].apply(lambda x: split_by_fullstop(x[2]))
rest['temp'] = rest['temp'].apply(lambda x: [x[1]])
# Re-assemble the full sentence list, dropping empties.
rest['temp'] = rest.apply(
    lambda x: [k for k in x.temp0 + x.temp + x.temp2 if len(k) > 0], axis=1)
# Prefix every sentence with its index: {0}sent0{1}sent1...
rest['temp'] = rest['temp'].apply(
    lambda x: ''.join(['{' + str(i) + '}' + k for i, k in enumerate(x)]))
rest['给定的故事'] = rest['temp']
rest = rest[COLUMNS]
rest.to_csv('data/sct_tobeanoed_after498_modify.csv', encoding="utf_8_sig")

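split_by_pair from utils is not shown either; a hypothetical reconstruction consistent with the usage above, where x[0], x[1], and x[2] act as before/inside/after the 【】 pair:

def split_by_pair(text, left='【', right='】'):
    # Assumed behavior: split around the first left...right pair,
    # returning (before, marked sentence, after) without the brackets.
    start = text.index(left)
    end = text.index(right, start)
    return text[:start], text[start + len(left):end], text[end + len(right):]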
Code example #8
import nltk
import pandas as pd
from utils import split_by_fullstop

r = pd.read_csv('32-deduplicate-story.csv', encoding="utf_8_sig")
result_list = r.story.values.tolist()
result_list = [x.strip() for x in result_list]
# The ending of each story is its last sentence.
endings = [split_by_fullstop(x)[-1] for x in result_list]

s_bleu = []
for i, ending in enumerate(endings):
    print(i)
    # References: every sentence from every *other* story.
    refs = [result_list[j] for j in range(len(result_list)) if j != i]
    refs = [x for y in refs for x in split_by_fullstop(y)]
    # Character-level BLEU: list(x) turns a Chinese sentence into characters.
    references = [list(x) for x in refs]
    hypothesis = list(ending)
    bs = nltk.translate.bleu_score.sentence_bleu(references, hypothesis)
    s_bleu.append(bs)
    print('bs:', bs)
r['ending_bleu'] = s_bleu
r['ending'] = endings
r.to_csv('33-get-ending-bleu.csv', encoding="utf_8_sig")
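
For reference, a minimal illustration of the character-level BLEU used above, on toy strings rather than the real data:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

refs = [list('他回家吃饭。'), list('她出门上班。')]
hyp = list('他回家睡觉。')
# Smoothing avoids zero scores when higher-order n-grams have no overlap.
score = sentence_bleu(refs, hyp, smoothing_function=SmoothingFunction().method1)
print(score)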