from copy import deepcopy

import pandas as pd
import spacy
from checklist.perturb import Perturb

nlp = spacy.load('en_core_web_sm')


def change_first_name(df: pd.DataFrame, orig_index: int, tokens_orig: list):
    """
    Change the first name in each sentence, if one exists and the name appears
    in CheckList's first-name lookup JSON.

    :param df: DataFrame containing sentences
    :param orig_index: integer indicating the column index of the original sentence
    :param tokens_orig: tokenised version of each sentence
    :return: None. Modifies the DataFrame in-place.
    """
    new_column_tokens, new_column_concat, new_column_success_flag, new_column_pert_indices = _gen_empty_columns()
    # CheckList requires pre-processing with spaCy for this perturbation
    pdata = list(nlp.pipe(df.iloc[:, orig_index]))
    for s in range(len(tokens_orig)):
        sentence = tokens_orig[s]
        new_sentence = Perturb.change_names(pdata[s], n=1, first_only=True, meta=True)
        if not new_sentence:
            # no name found: keep the original sentence and flag failure
            new_column_tokens.append(sentence)
            new_column_concat.append(df.iloc[s, orig_index])
            new_column_success_flag.append(0)
            new_column_pert_indices.append(None)
        else:
            # extract the token that has been perturbed
            token_pert = new_sentence[1][0][0]
            # verify that the perturbed name appears as a token in the original input
            if token_pert in sentence:
                # obtain its index and replace it with the new name
                token_index = sentence.index(token_pert)
                new_sentence_tokens = deepcopy(sentence)
                new_sentence_tokens[token_index] = new_sentence[1][0][1]
                # update columns
                new_column_tokens.append(new_sentence_tokens)
                new_column_concat.append(new_sentence[0][0])
                new_column_success_flag.append(1)
                new_column_pert_indices.append([token_index])
            # if the token cannot be found in the original list of tokens
            else:
                new_column_tokens.append(sentence)
                new_column_concat.append(df.iloc[s, orig_index])
                new_column_success_flag.append(0)
                new_column_pert_indices.append(None)
    df['change_first_name_concat'] = new_column_concat
    df['change_first_name_tokens'] = new_column_tokens
    df['change_first_name_success'] = new_column_success_flag
    df['change_first_name_pert_ind'] = new_column_pert_indices
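The function above relies on a _gen_empty_columns helper that is not shown; from its call site it evidently returns four empty lists, one per output column. Below is a minimal sketch of that helper plus a hypothetical invocation; the example DataFrame and the whitespace tokenisation are illustrative assumptions, not part of the original code.

def _gen_empty_columns():
    # one empty list per output column: tokens, concatenated sentence,
    # success flag, and perturbed-token indices (inferred from the call site)
    return [], [], [], []


# hypothetical usage: column 0 holds the raw sentences,
# and tokens are obtained by whitespace splitting
df_example = pd.DataFrame({'sentence': ["Mary likes the film .", "A gloomy ending ."]})
change_first_name(df_example, orig_index=0,
                  tokens_orig=[s.split() for s in df_example['sentence']])
print(df_example[['change_first_name_concat', 'change_first_name_success']])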
from pathlib import Path

import pandas as pd
import spacy
from checklist.perturb import Perturb

filename = "contains1.txt"
base = Path('SST-2')
sents = base / 'datasetSentences.txt'
split = base / 'datasetSplit.txt'

df = pd.read_table(sents)
df = df.join(pd.read_csv(split).set_index('sentence_index'), on='sentence_index')
# keep only sentences with splitset_label == 2 (the SST test split)
seeds = df[df['splitset_label'] == 2]['sentence'].values.tolist()

nlp = spacy.load('en_core_web_sm')
pdataset = list(nlp.pipe(seeds))

# keep only seed sentences that CheckList can perturb in at least one way
with open(filename, "w", encoding='utf-8') as filter_seed:
    for i in range(len(pdataset)):
        trans1 = Perturb.change_names(pdataset[i])
        trans2 = Perturb.change_location(pdataset[i])
        trans3 = Perturb.change_number(pdataset[i])
        # stricter alternative: require at least two of the three perturbations
        # if ((trans1 is not None and trans3 is not None)
        #         or (trans2 is not None and trans3 is not None)
        #         or (trans1 is not None and trans2 is not None)):
        if trans3 is not None or trans2 is not None or trans1 is not None:
            filter_seed.write(seeds[i])
            filter_seed.write("\n")
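If the two snippets belong to the same pipeline, the seeds written to contains1.txt can be read back and fed to change_first_name above. A minimal sketch, assuming one sentence per line and whitespace tokenisation (both are assumptions for illustration, not part of the original code):

# read the filtered seeds back; quoting=3 (csv.QUOTE_NONE) keeps raw
# sentence text intact even when it contains quote characters
seeds_df = pd.read_table("contains1.txt", header=None, names=['sentence'], quoting=3)
change_first_name(seeds_df, orig_index=0,
                  tokens_orig=[s.split() for s in seeds_df['sentence']])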