Example no. 1
0
def change_location(df: pd.DataFrame, orig_index: int, tokens_orig: list):
    """
    Change a location mention in each sentence, if one exists and it appears
    in CheckList's location lookup json.

    :param df: DataFrame containing sentences
    :param orig_index: integer indicating the column index of the original sentence
    :param tokens_orig: tokenised version of each sentence (list of token lists,
        assumed row-aligned with ``df`` — TODO confirm against callers)
    :return: None. Modifies DataFrame in-place by appending four columns:
        perturbed concatenated sentence, perturbed tokens, success flag (0/1),
        and the perturbed token indices (or None on failure).
    """
    new_column_tokens, new_column_concat, new_column_success_flag, new_column_pert_indices = _gen_empty_columns(
    )
    # CheckList requires pre-processing with spaCy for this perturbation
    pdata = list(nlp.pipe(df.iloc[:, orig_index]))

    def _record_failure(row: int, sentence: list) -> None:
        # Keep the original sentence untouched and flag the row as unperturbed.
        new_column_tokens.append(sentence)
        new_column_concat.append(df.iloc[row, orig_index])
        new_column_success_flag.append(0)
        new_column_pert_indices.append(None)

    for s, sentence in enumerate(tokens_orig):
        new_sentence = Perturb.change_location(pdata[s], n=1, meta=True)
        if not new_sentence:
            # CheckList found no location to perturb in this sentence
            _record_failure(s, sentence)
            continue
        # extract the token that has been perturbed
        token_pert = new_sentence[1][0][0]
        if token_pert in sentence:
            # perturbed location appears as a token in the original input:
            # swap it for its replacement at the same position
            token_index = sentence.index(token_pert)
            new_sentence_tokens = deepcopy(sentence)
            new_sentence_tokens[token_index] = new_sentence[1][0][1]
            new_column_tokens.append(new_sentence_tokens)
            new_column_concat.append(new_sentence[0][0])
            new_column_success_flag.append(1)
            new_column_pert_indices.append([token_index])
        else:
            # token cannot be matched in the original list of tokens
            _record_failure(s, sentence)

    df['change_location_concat'] = new_column_concat
    df['change_location_tokens'] = new_column_tokens
    df['change_location_success'] = new_column_success_flag
    df['change_location_pert_ind'] = new_column_pert_indices
Example no. 2
0
from pathlib import Path
import pandas as pd
from checklist.perturb import Perturb
import spacy

# Output file collecting seed sentences that admit at least one perturbation.
filename = "contains1.txt"
base = Path('SST-2')
sents = base / 'datasetSentences.txt'
split = base / 'datasetSplit.txt'
df = pd.read_table(sents)
df = df.join(pd.read_csv(split).set_index('sentence_index'),
             on='sentence_index')
# splitset_label == 2 selects one split of the SST-2 dataset as seed sentences
seeds = df[df['splitset_label'] == 2]['sentence'].values.tolist()

nlp = spacy.load('en_core_web_sm')
pdataset = list(nlp.pipe(seeds))

# Keep only sentences for which CheckList can change at least one of:
# a name, a location, or a number.
with open(filename, "w", encoding='utf-8') as filter_seed:
    for seed, doc in zip(seeds, pdataset):
        perturbations = (
            Perturb.change_names(doc),
            Perturb.change_location(doc),
            Perturb.change_number(doc),
        )
        if any(p is not None for p in perturbations):
            filter_seed.write(seed)
            filter_seed.write("\n")