Example #1
from pathlib import Path

from automatic_data_generation.utils.io import read_csv, write_csv


def save_augmented_dataset(generated_sentences, n_generated, train_path,
                           output_dir):
    """Append generated utterances to the original train CSV and write the
    augmented dataset to output_dir."""
    dataset = read_csv(train_path)
    for s, l, d, i in zip(generated_sentences['utterances'],
                          generated_sentences['labellings'],
                          generated_sentences['delexicalised'],
                          generated_sentences['intents']):
        dataset.append([s, l, d, i])
    augmented_path = output_dir / Path(
        train_path.name.replace('.csv', '_aug_{}.csv'.format(n_generated)))
    write_csv(dataset, augmented_path)
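
A minimal usage sketch, not taken from the repository: the generated_sentences dict keys match those iterated above, while the utterance, label and intent values are purely illustrative.

# Hypothetical call; all values are made up for the example.
generated_sentences = {
    'utterances': ['play some jazz'],
    'labellings': ['O O B-genre '],
    'delexicalised': ['play some _genre_'],
    'intents': ['PlayMusic'],
}
save_augmented_dataset(generated_sentences,
                       n_generated=1,
                       train_path=Path('data/train.csv'),
                       output_dir=Path('data'))
# -> writes data/train_aug_1.csv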
Example #2
    def build_data_files(self,
                         dataset_folder,
                         dataset_size=None,
                         restrict_intents=None,
                         none_folder=None,
                         none_size=None,
                         none_intents=None,
                         none_idx=None,
                         infersent_selection="no_infersent_selection",
                         cosine_threshold=None,
                         output_folder=None,
                         skip_header=True):
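        """Build train / validation CSV files: optionally restrict intents,
        trim the train set with a stratified split, and add 'none' utterances
        drawn from none_folder. Returns the paths of the two files written."""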

        original_train_path = dataset_folder / 'train.csv'
        original_test_path = dataset_folder / 'validate.csv'

        new_train = read_csv(original_train_path)
        new_test = read_csv(original_test_path)

        if skip_header:
            header_train = new_train[0]
            header_test = new_test[0]
            new_train = new_train[1:]
            new_test = new_test[1:]

        # filter intents
        filter_prefix = ''
        if restrict_intents is not None:
            filter_prefix = '_filtered'
            new_train = self.filter_intents(new_train, restrict_intents)
            new_test = self.filter_intents(new_test, restrict_intents)

        # trim the train set to dataset_size using a stratified split
        trim_prefix = ''
        if dataset_size is not None:
            trim_prefix = '_{}'.format(dataset_size)
            original_dataset_size = len(new_train)
            keep_fraction = dataset_size / original_dataset_size
            intents = self.get_intents(new_train)
            sss = StratifiedShuffleSplit(n_splits=1,
                                         test_size=1 - keep_fraction)
            keep_indices = list(sss.split(intents, intents))[0][0]
            new_train = [new_train[i] for i in keep_indices]
            # new_train = random.sample(new_train, dataset_size)

        # add nones
        train_none_prefix = ''
        test_none_prefix = ''
        if none_size is not None:
            train_none_prefix = '_none_{}'.format(none_size)
            test_none_prefix = '_with_none'
            pseudolabels = None
            if infersent_selection != NO_INFERSENT_SELECTION:
                assert (none_intents is None)
                none_intents, pseudolabels = self.select_none_intents(
                    new_train, restrict_intents, none_folder, cosine_threshold)
                if infersent_selection == 'unsupervised':
                    pseudolabels = None  # ignore pseudolabels
            new_train = self.add_nones(new_train,
                                       none_folder,
                                       none_size=none_size,
                                       none_intents=none_intents,
                                       pseudolabels=pseudolabels,
                                       none_idx=none_idx)
            new_test = self.add_nones(new_test,
                                      none_folder,
                                      none_size=200,
                                      none_intents=none_intents,
                                      none_idx=none_idx)

        if output_folder is not None:
            new_train_path = output_folder / 'train{}{}{}.csv'.format(
                trim_prefix, train_none_prefix, filter_prefix)
            new_test_path = output_folder / 'validate{}{}.csv'.format(
                test_none_prefix, filter_prefix)
        else:
            new_train_path = dataset_folder / 'train{}{}{}.csv'.format(
                trim_prefix, train_none_prefix, filter_prefix)
            new_test_path = dataset_folder / 'validate{}{}.csv'.format(
                test_none_prefix, filter_prefix)

        if skip_header:
            new_train = [header_train] + new_train
            new_test = [header_test] + new_test

        write_csv(new_test, new_test_path)
        write_csv(new_train, new_train_path)

        return new_train_path, new_test_path
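
A hedged usage sketch, not taken from the repository: the handler class name, folder layout, and intent names below are illustrative assumptions.

# Hypothetical call, assuming the method above is defined on some dataset
# handler class and that dataset_folder contains train.csv and validate.csv.
handler = DatasetHandler()  # hypothetical class owning build_data_files
train_path, test_path = handler.build_data_files(
    dataset_folder=Path('data/snips'),
    dataset_size=500,                              # stratified trim of the train set
    restrict_intents=['PlayMusic', 'GetWeather'],  # illustrative intent names
    none_folder=Path('data/none'),
    none_size=100,
    none_idx=0,
    output_folder=Path('data/prepared'))
# -> data/prepared/train_500_none_100_filtered.csv
#    data/prepared/validate_with_none_filtered.csv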
Example #3
from automatic_data_generation.utils.io import read_csv, write_csv
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit

data = read_csv(Path('data.csv'))
header = [data[0]]
data = data[1:]

intents = [row[3] for row in data]

test_fraction = 0.2
test_size = int(test_fraction * len(data))

sss = StratifiedShuffleSplit(n_splits=1,
                             test_size=test_size)

train_indices, test_indices = list(sss.split(intents, intents))[0]

train = header + [data[i] for i in train_indices]
validate = header + [data[i] for i in test_indices]

write_csv(validate, Path('validate.csv'))
write_csv(train, Path('train.csv'))
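
For reference, row[3] above is the intent column of the ['utterance', 'labels', 'delexicalised', 'intent'] header used throughout these examples. A minimal sketch of the same stratified split on toy data (illustrative values only):

# Toy illustration: the split is stratified on the intent labels, so each
# intent contributes proportionally to the validation indices.
toy_intents = ['PlayMusic'] * 8 + ['GetWeather'] * 8
toy_sss = StratifiedShuffleSplit(n_splits=1, test_size=4)
toy_train_idx, toy_test_idx = next(toy_sss.split(toy_intents, toy_intents))
# toy_test_idx holds 4 indices, 2 drawn from each intent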

Example #4
def json2csv(datadir, outdir, samples_per_intent):
    """Convert the per-intent JSON files in datadir into train/validate CSVs
    with columns (utterance, labels, delexicalised, intent), plus a pickled
    dictionary of slot values per split."""
    print('Starting json2csv conversion...')
    remove_punctuation = True  # strip punctuation from utterances and slot names
    punctuation = [',', '.', ';', '?', '!', '\"']
    data_folder = Path(datadir)
    out_folder = Path(outdir)

    for split in ['train', 'validate']:
        data_dict = {}
        for intent_dir in data_folder.iterdir():
            if not intent_dir.is_dir():
                continue
            intent = intent_dir.stem
            suffix = '{}_{}{}.json'.format(
                split, intent, '_full' if split == 'train' else ''
            )
            data_dict[intent] = load_json(intent_dir / suffix,
                                          encoding='latin1')[intent]

        slotdic = {}
        csv_data = [['utterance', 'labels', 'delexicalised', 'intent']]
        for intent, data in data_dict.items():
            for isent, sentence in enumerate(data):
                if isent >= samples_per_intent:
                    break
                utterance = ''
                labelling = ''
                delexicalised = ''

                for group in sentence['data']:
                    words = group['text']
                    try:
                        words = words.encode('latin-1').decode('utf8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        if 'entity' not in group.keys():
                            print("skipping because of bad encoding:{}".format(
                                words))
                            continue
                        else:
                            words = words.encode('utf8').decode('utf8')

                    if remove_punctuation:
                        for p in punctuation:
                            words = words.replace(p, '')
                    words = words.replace('\n', '')  # trailing new lines are
                    # misread by csv writer
                    utterance += words

                    if 'entity' in group.keys():  # this group is a slot
                        slot = group['entity'].lower()
                        if remove_punctuation:
                            for p in punctuation:
                                slot = slot.replace(p, '')

                        delexicalised += '_' + slot + '_'
                        for i, word in enumerate(word_tokenize(words)):
                            # if word == '':
                            #     continue
                            if i == 0:
                                word = 'B-' + slot + ' '
                            else:
                                word = 'I-' + slot + ' '
                            labelling += word

                        if slot not in slotdic.keys():
                            slotdic[slot] = [words]
                        else:
                            if words not in slotdic[slot]:
                                slotdic[slot].append(words)

                    else:  # this group is just context
                        delexicalised += words
                        labelling += 'O ' * len(word_tokenize(words))

                csv_data.append([utterance, labelling, delexicalised, intent])

        output_file = out_folder / '{}.csv'.format(split)
        write_csv(csv_data, output_file)

        output_pickle_file = out_folder / '{}_slot_values.pkl'.format(split)
        with open(output_pickle_file, 'wb') as f:
            print(slotdic.keys())
            pickle.dump(slotdic, f)
            print('Dumped slot dictionary')
    print('Example : ')
    print('Original utterance : ', utterance)
    print('Labelled : ', labelling)
    print('Delexicalised : ', delexicalised)

    print('Successfully converted json2csv !')
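
A hedged call sketch: the layout implied above is one sub-folder per intent containing train_<intent>_full.json and validate_<intent>.json; the paths below are illustrative.

# Hypothetical invocation; datadir/outdir values are made up for the example.
json2csv(datadir='data/snips_json',
         outdir='data/snips_csv',
         samples_per_intent=2000)
# -> data/snips_csv/train.csv, validate.csv and *_slot_values.pkl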
Example #5
def new_json2csv(datadir, outdir):
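    """Variant of json2csv that reads a single dataset.json (with an
    'intents' -> intent -> 'utterances' structure) and holds out 20% of each
    intent's utterances for validate.csv."""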
    data_folder = Path(datadir)
    out_folder = Path(outdir)
    data = load_json(data_folder / 'dataset.json', encoding='latin1')
    remove_punctuation = True
    punctuation = [',', '.', ';', '?', '!', '\"']

    val_fraction = 0.2

    for split in ['train', 'validate']:

        csv_data = [['utterance', 'labels', 'delexicalised', 'intent']]

        for intent in data['intents'].keys():

            num_val_sentences = int(
                val_fraction * len(data['intents'][intent]['utterances']))
            print(split, intent, num_val_sentences)
            if split == 'validate':
                sentences = data['intents'][intent]['utterances'][
                            :num_val_sentences]
            else:
                sentences = data['intents'][intent]['utterances'][
                            num_val_sentences:]
            print(len(sentences))

            for sentence in sentences:

                utterance = ''
                labelling = ''
                delexicalised = ''

                for group in sentence['data']:
                    words = group['text']
                    try:
                        words = words.encode('latin-1').decode('utf8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        if 'entity' not in group.keys():
                            print("skipping because of bad encoding:{}".format(
                                words))
                            continue
                        else:
                            words = words.encode('utf8').decode('utf8')
                    if remove_punctuation:
                        for p in punctuation:
                            words = words.replace(p, '')
                    words = words.replace('\n', '')  # trailing new lines are
                    # misread by csv writer
                    utterance += words

                    if 'slot_name' in group.keys():  # this group is a slot
                        slot = group['slot_name'].lower()
                        if remove_punctuation:
                            for p in punctuation:
                                slot = slot.replace(p, '')

                        delexicalised += '_' + slot + '_'
                        for i, word in enumerate(word_tokenize(words)):
                            if i == 0:
                                word = 'B-' + slot + ' '
                            else:
                                word = 'I-' + slot + ' '
                            labelling += word

                    else:  # this group is just context
                        delexicalised += words
                        labelling += 'O ' * len(word_tokenize(words))

                csv_data.append([utterance, labelling, delexicalised, intent])

        output_file = out_folder / '{}.csv'.format(split)
        write_csv(csv_data, output_file)
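
A hedged call sketch for the single-file variant above; the paths are illustrative.

# Hypothetical invocation; expects datadir to contain dataset.json.
new_json2csv(datadir='data/snips_json', outdir='data/snips_csv')
# -> data/snips_csv/train.csv and data/snips_csv/validate.csv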