import copy
import random

import pandas as pd
import torch
from bert_score import BERTScorer

from utils import split_by_fullstop


def get_anocontext_randmid(x):
    # Wrap a randomly chosen middle sentence of the story in 【】 markers.
    s = split_by_fullstop(x)
    if len(s) <= 2:
        return False
    # Sample an index that is neither the first nor the last sentence.
    sample_index = random.randint(1, len(s) - 2)
    s[sample_index] = '【' + s[sample_index] + '】'
    return ''.join(s)
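# The helpers in this module rely on `split_by_fullstop` from utils, whose
# source is not shown here. A minimal sketch of its assumed behavior: split a
# Chinese story into sentences on sentence-final punctuation, keeping each
# delimiter attached to its sentence (so ''.join(...) reconstructs the story).
# The name and regex below are illustrative, not the real implementation.
import re

def _split_by_fullstop_sketch(text):
    parts = re.split(r'(?<=[。！？…])', text)
    return [p for p in parts if p]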
def get_anocontext_end(x):
    # Wrap the final sentence in 【】 markers, unless the ending is already
    # flagged as high-BLEU (i.e. too generic to be worth annotating).
    if x.high_end_bleu:
        return False
    s = split_by_fullstop(x.story)
    s[-1] = '【' + s[-1] + '】'
    return ''.join(s)
def gen_samples():
    # Build (context, true sentence, hardest-negative sentence) triples.
    scorer = BERTScorer(lang="zh", rescale_with_baseline=True)
    data_file = '32-deduplicate-story.csv'
    df = pd.read_csv(data_file)
    stories = list(df.story.dropna())
    stories_split = [split_by_fullstop(x) for x in stories]
    # For each story, pick one sentence to mask out.
    stories_split_select = [
        random.randint(0, len(x) - 1) for x in stories_split
    ]
    stories_sentencesample = [
        x[y] for x, y in zip(stories_split, stories_split_select)
    ]
    # Mask on a deep copy so stories_split stays intact for reference mining.
    stories_split_copy = copy.deepcopy(stories_split)
    stories_context = []
    for ss, sss in zip(stories_split_copy, stories_split_select):
        ss[sss] = '<MASK>'
        stories_context.append(ss)
    stories_context = [''.join(x) for x in stories_context]
    positive_samples = [
        (x, y, True) for x, y in zip(stories_context, stories_sentencesample)
    ]
    cands = stories_sentencesample
    assert len(cands) == len(stories_split)
    # For each candidate, the references are all non-empty sentences drawn
    # from every *other* story.
    refs = []
    for i, cand in enumerate(cands):
        refs.append([
            x for j, y in enumerate(stories_split)
            for x in y if len(x) > 0 and j != i
        ])
    # The reference with the highest BERTScore recall is the hardest negative.
    bestmatch = []
    print(len(cands))
    for i, (c, ref) in enumerate(zip(cands, refs)):
        print(i, 'th candidate...')
        cand = [c] * len(ref)
        P, R, F1 = scorer.score(cand, ref)
        bestmatch.append(int(torch.argmax(R)))
    negative_samples = [(x, y[z], False)
                        for x, y, z in zip(stories_context, refs, bestmatch)]
    # Return (context, true sentence, hardest negative) per story.
    return [(x, w, y[z]) for x, y, z, w in zip(
        stories_context, refs, bestmatch, stories_sentencesample)]
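# For reference, bert_score's BERTScorer.score takes parallel lists of
# candidate and reference strings and returns (P, R, F1) tensors, which is why
# each candidate above is tiled to the length of its reference list, e.g.:
# P, R, F1 = scorer.score(['今天天气很好。'] * 2, ['天气不错。', '他回家了。'])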
def gen_PC():
    # Build the plot-completion (PC) dataset: valid/test contexts come from
    # the human-adjusted file, train contexts from all other stories.
    vt = pd.read_csv('91-adjust_bias.csv')
    vt = vt[['context', 'True']]
    # Recover the full story by filling the masked sentence back in.
    vt['story'] = vt.apply(
        lambda x: x['context'].replace('<MASK>', x['True']), axis=1)
    df = pd.read_csv('32-deduplicate-story.csv')
    dict_sct = []
    for story in df.story:
        for s in split_by_fullstop(story):
            # Note: str.replace masks every occurrence of the sentence, so a
            # sentence duplicated within one story would be masked twice.
            dict_sct.append({
                'story': story,
                'context': story.replace(s, '<MASK>'),
                'True': s,
            })
    train = pd.DataFrame(dict_sct)
    # Keep the training set disjoint from the valid/test stories.
    train = train[~train.story.isin(vt['story'].tolist())]
    train[['context', 'True']].to_csv('99-dataset/PC/PC_train.csv')
    # Split the held-out stories 50/50 into valid and test.
    stories_in_vt = sorted(set(vt['story'].tolist()))
    random.shuffle(stories_in_vt)
    stories_in_valid = stories_in_vt[:int(len(stories_in_vt) * 0.5)]
    stories_in_test = stories_in_vt[int(len(stories_in_vt) * 0.5):]
    valid = vt[vt.story.isin(stories_in_valid)]
    test = vt[vt.story.isin(stories_in_test)]
    valid.to_csv('99-dataset/PC/PC_valid.csv')
    test.to_csv('99-dataset/PC/PC_test.csv')
def get_sentence_len(s):
    # Number of sentences in the story, as delimited by full stops.
    return len(split_by_fullstop(s))
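# Hypothetical usage (not part of the original module): regenerate the
# negative samples and the PC splits in one pass.
# if __name__ == '__main__':
#     samples = gen_samples()
#     gen_PC()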
from bert_score import BERTScorer
import random

import pandas as pd
import torch

from utils import split_by_fullstop
from tools import start_debugger_on_exception

start_debugger_on_exception()

scorer = BERTScorer(lang="zh", rescale_with_baseline=True)
data_file = 'annotateddata/batch1.csv'
df = pd.read_csv(data_file)
stories = list(df.RESULT.dropna())
stories_split = [split_by_fullstop(x) for x in stories]
# Flatten all non-empty sentences into the reference pool.
refs_pre = [x for y in stories_split for x in y if len(x) > 0]
# Sample one sentence per story to mask out.
stories_split_select = [random.randint(0, len(x) - 1) for x in stories_split]
stories_sentencesample = [
    x[y] for x, y in zip(stories_split, stories_split_select)
]
# Note: this mutates stories_split in place; refs_pre and the sentence
# samples were already extracted above, so they are unaffected.
stories_context = []
for ss, sss in zip(stories_split, stories_split_select):
    ss[sss] = '<MASK>'
    stories_context.append(ss)
stories_context = [''.join(x) for x in stories_context]
positive_samples = [
    (x, y, True) for x, y in zip(stories_context, stories_sentencesample)
]
# Build the full candidate x reference cross product as two parallel lists:
# each candidate is repeated len(refs_pre) times, and the reference pool is
# tiled once per candidate.
cands_pre = stories_sentencesample
len_refs = len(refs_pre)
len_cands = len(cands_pre)
cands = [x for x in cands_pre for i in range(len_refs)]
refs = refs_pre * len_cands
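# Toy illustration of the pairing above (illustrative values only):
# cands_pre = ['a', 'b'] and refs_pre = ['x', 'y', 'z'] give
# cands = ['a', 'a', 'a', 'b', 'b', 'b'] and refs = ['x', 'y', 'z', 'x', 'y', 'z'],
# so pair i compares cands_pre[i // len_refs] with refs_pre[i % len_refs].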
import pandas as pd

from utils import split_by_pair, split_by_fullstop

# Annotation columns, in order: title, given story, REJECT, "identify the
# sentence that can be inferred with commonsense", "rewrite the marked
# sentence into a context-relevant but commonsense-violating sentence",
# and "rewriting strategy".
COLUMNS = '标题,给定的故事,REJECT,找出其中可以使用常识推断的句子,将标出的句子改写为一个与上下文内容相关但违反常识的句子,改写策略'.split(',')

df = pd.read_csv('annotateddata/05-14anticomm-top500.csv')
rest = df.iloc[498:]
rest = rest[COLUMNS]
# Split each story around its 【...】-marked sentence into
# (before, marked sentence, after).
rest['temp'] = rest['给定的故事'].apply(
    lambda x: split_by_pair(x, left='【', right='】'))
rest['temp0'] = rest['temp'].apply(lambda x: split_by_fullstop(x[0]))
rest['temp2'] = rest['temp'].apply(lambda x: split_by_fullstop(x[2]))
rest['temp'] = rest['temp'].apply(lambda x: [x[1]])
# Reassemble the sentence list and prefix each sentence with its index {i}.
rest['temp'] = rest.apply(
    lambda x: [k for k in x.temp0 + x.temp + x.temp2 if len(k) > 0], axis=1)
rest['temp'] = rest['temp'].apply(
    lambda x: ''.join(['{' + str(i) + '}' + k for i, k in enumerate(x)]))
rest['给定的故事'] = rest['temp']
rest = rest[COLUMNS]
rest.to_csv('data/sct_tobeanoed_after498_modify.csv', encoding="utf_8_sig")
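# Sketch of the assumed `split_by_pair` contract (the real helper lives in
# utils and is not shown): split a story around one 【...】-marked span into
# [before, inside, after]. Illustrative name to avoid shadowing the import.
def _split_by_pair_sketch(text, left='【', right='】'):
    before, _, rest = text.partition(left)
    inside, _, after = rest.partition(right)
    return [before, inside, after]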
import nltk
import pandas as pd

from utils import split_by_fullstop

r = pd.read_csv('32-deduplicate-story.csv', encoding="utf_8_sig")
result_list = [x.strip() for x in r.story.values.tolist()]
endings = [split_by_fullstop(x)[-1] for x in result_list]

# For each story ending, compute character-level BLEU against every sentence
# of every *other* story; high BLEU marks generic, formulaic endings.
s_bleu = []
for i, ending in enumerate(endings):
    print(i)
    refs = [result_list[j] for j in range(len(result_list)) if j != i]
    refs = [x for y in refs for x in split_by_fullstop(y)]
    references = [list(x) for x in refs]  # character tokens
    hypothesis = list(ending)
    bs = nltk.translate.bleu_score.sentence_bleu(references, hypothesis)
    s_bleu.append(bs)
    print('bs:', bs)

r['ending_bleu'] = s_bleu
r['ending'] = endings
r.to_csv('33-get-ending-bleu.csv', encoding="utf_8_sig")
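# Note: with character-level tokens, sentence_bleu can hit zero higher-order
# n-gram counts and emit warnings. If that becomes a problem, nltk's
# SmoothingFunction can be passed in, e.g.:
# from nltk.translate.bleu_score import SmoothingFunction
# bs = nltk.translate.bleu_score.sentence_bleu(
#     references, hypothesis,
#     smoothing_function=SmoothingFunction().method1)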