import pickle
from pathlib import Path

from nltk.tokenize import word_tokenize
from sklearn.model_selection import StratifiedShuffleSplit

# read_csv/write_csv are the project's csv helpers; load_json is assumed to
# live in the same io module.
from automatic_data_generation.utils.io import load_json, read_csv, write_csv

# Sentinel for build_data_files below; assumed to match its default argument.
NO_INFERSENT_SELECTION = 'no_infersent_selection'


def save_augmented_dataset(generated_sentences, n_generated, train_path,
                           output_dir):
    """Append generated sentences to the original train set and write the
    result to <train stem>_aug_<n_generated>.csv in output_dir."""
    dataset = read_csv(train_path)
    for s, l, d, i in zip(generated_sentences['utterances'],
                          generated_sentences['labellings'],
                          generated_sentences['delexicalised'],
                          generated_sentences['intents']):
        dataset.append([s, l, d, i])
    augmented_path = output_dir / Path(
        train_path.name.replace('.csv', '_aug_{}.csv'.format(n_generated)))
    write_csv(dataset, augmented_path)
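# Usage sketch for save_augmented_dataset. The paths and the generated batch
# below are hypothetical; they only illustrate the expected shapes.
# generated = {'utterances': ['play some jazz'],
#              'labellings': ['O O B-genre'],
#              'delexicalised': ['play some _genre_'],
#              'intents': ['PlayMusic']}
# save_augmented_dataset(generated, n_generated=1,
#                        train_path=Path('data/train.csv'),
#                        output_dir=Path('data'))
# -> writes data/train_aug_1.csv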
# Method of the dataset-building class; self.filter_intents, self.get_intents,
# self.add_nones and self.select_none_intents are defined elsewhere on it.
def build_data_files(self, dataset_folder, dataset_size=None,
                     restrict_intents=None, none_folder=None,
                     none_size=None, none_intents=None, none_idx=None,
                     infersent_selection=NO_INFERSENT_SELECTION,
                     cosine_threshold=None, output_folder=None,
                     skip_header=True):
    original_train_path = dataset_folder / 'train.csv'
    original_test_path = dataset_folder / 'validate.csv'
    new_train = read_csv(original_train_path)
    new_test = read_csv(original_test_path)

    if skip_header:
        header_train = new_train[0]
        header_test = new_test[0]
        new_train = new_train[1:]
        new_test = new_test[1:]

    # filter intents
    filter_prefix = ''
    if restrict_intents is not None:
        filter_prefix = '_filtered'
        new_train = self.filter_intents(new_train, restrict_intents)
        new_test = self.filter_intents(new_test, restrict_intents)

    # trim dataset with a stratified subsample so that the per-intent
    # proportions of the original train set are preserved
    trim_prefix = ''
    if dataset_size is not None:
        trim_prefix = '_{}'.format(dataset_size)
        original_dataset_size = len(new_train)
        keep_fraction = dataset_size / original_dataset_size
        intents = self.get_intents(new_train)
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=1 - keep_fraction)
        # keep the train indices of the single stratified split
        keep_indices = list(sss.split(intents, intents))[0][0]
        new_train = [new_train[i] for i in keep_indices]

    # add 'none' (out-of-domain) samples
    train_none_prefix = ''
    test_none_prefix = ''
    if none_size is not None:
        train_none_prefix = '_none_{}'.format(none_size)
        test_none_prefix = '_with_none'
        pseudolabels = None
        if infersent_selection != NO_INFERSENT_SELECTION:
            # infersent selection picks the none intents itself, so the two
            # options are mutually exclusive
            assert none_intents is None
            none_intents, pseudolabels = self.select_none_intents(
                new_train, restrict_intents, none_folder, cosine_threshold)
            if infersent_selection == 'unsupervised':
                pseudolabels = None  # ignore pseudolabels
        new_train = self.add_nones(new_train, none_folder,
                                   none_size=none_size,
                                   none_intents=none_intents,
                                   pseudolabels=pseudolabels,
                                   none_idx=none_idx)
        # the validation set always receives 200 none samples
        new_test = self.add_nones(new_test, none_folder, none_size=200,
                                  none_intents=none_intents,
                                  none_idx=none_idx)

    if output_folder is not None:
        new_train_path = output_folder / 'train{}{}{}.csv'.format(
            trim_prefix, train_none_prefix, filter_prefix)
        new_test_path = output_folder / 'validate{}{}.csv'.format(
            test_none_prefix, filter_prefix)
    else:
        new_train_path = dataset_folder / 'train{}{}{}.csv'.format(
            trim_prefix, train_none_prefix, filter_prefix)
        new_test_path = dataset_folder / 'validate{}{}.csv'.format(
            test_none_prefix, filter_prefix)

    if skip_header:
        new_train = [header_train] + new_train
        new_test = [header_test] + new_test

    write_csv(new_test, new_test_path)
    write_csv(new_train, new_train_path)
    return new_train_path, new_test_path
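# Hypothetical call, assuming `builder` is an instance of the class this
# method belongs to and that data/snips/ holds train.csv and validate.csv:
# train_path, test_path = builder.build_data_files(
#     dataset_folder=Path('data/snips'),
#     dataset_size=500,              # stratified subsample of 500 train rows
#     restrict_intents=['PlayMusic', 'GetWeather'],
#     none_folder=Path('data/none'),
#     none_size=100,                 # add 100 'none' rows to the train set
#     none_idx=3,
#     output_folder=Path('data/prepared'))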
# Standalone script: split data.csv into stratified train/validate files,
# preserving per-intent proportions (the intent is column 3 of each row).
from pathlib import Path

from sklearn.model_selection import StratifiedShuffleSplit

from automatic_data_generation.utils.io import read_csv, write_csv

data = read_csv(Path('data.csv'))
header = [data[0]]
data = data[1:]

intents = [row[3] for row in data]
test_fraction = 0.2
test_size = int(test_fraction * len(data))

sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
train_indices, test_indices = list(sss.split(intents, intents))[0]

train = header + [data[i] for i in train_indices]
validate = header + [data[i] for i in test_indices]
write_csv(validate, Path('validate.csv'))
write_csv(train, Path('train.csv'))
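# Optional sanity check: a stratified split should keep the per-intent
# proportions of the two output files close to those of data.csv.
from collections import Counter

print('train intents:   ', Counter(row[3] for row in train[1:]))
print('validate intents:', Counter(row[3] for row in validate[1:]))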
def json2csv(datadir, outdir, samples_per_intent):
    """Convert per-intent json files into train.csv/validate.csv plus a
    pickled slot -> values dictionary per split."""
    print('Starting json2csv conversion...')
    remove_punctuation = True  # assumed default, mirroring new_json2csv below
    punctuation = [',', '.', ';', '?', '!', '\"']
    data_folder = Path(datadir)
    out_folder = Path(outdir)
    for split in ['train', 'validate']:
        data_dict = {}
        for intent_dir in data_folder.iterdir():
            if not intent_dir.is_dir():
                continue
            intent = intent_dir.stem
            suffix = '{}_{}{}.json'.format(
                split, intent, '_full' if split == 'train' else '')
            data_dict[intent] = load_json(intent_dir / suffix,
                                          encoding='latin1')[intent]
        slotdic = {}
        csv_data = [['utterance', 'labels', 'delexicalised', 'intent']]
        for intent, data in data_dict.items():
            for isent, sentence in enumerate(data):
                if isent >= samples_per_intent:
                    break
                utterance = ''
                labelling = ''
                delexicalised = ''
                for group in sentence['data']:
                    words = group['text']
                    try:
                        words = words.encode('latin-1').decode('utf8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        if 'entity' not in group.keys():
                            print('skipping because of bad encoding: {}'
                                  .format(words))
                            continue
                        else:
                            words = words.encode('utf8').decode('utf8')
                    if remove_punctuation:
                        for p in punctuation:
                            words = words.replace(p, '')
                    # trailing newlines are misread by the csv writer
                    words = words.replace('\n', '')
                    utterance += words
                    if 'entity' in group.keys():  # this group is a slot
                        slot = group['entity'].lower()
                        if remove_punctuation:
                            for p in punctuation:
                                slot = slot.replace(p, '')
                        delexicalised += '_' + slot + '_'
                        # BIO tags: B- on the first token, I- on the rest
                        for i, word in enumerate(word_tokenize(words)):
                            if i == 0:
                                word = 'B-' + slot + ' '
                            else:
                                word = 'I-' + slot + ' '
                            labelling += word
                        # record every value seen for this slot
                        if slot not in slotdic.keys():
                            slotdic[slot] = [words]
                        elif words not in slotdic[slot]:
                            slotdic[slot].append(words)
                    else:  # this group is just context
                        delexicalised += words
                        labelling += 'O ' * len(word_tokenize(words))
                csv_data.append([utterance, labelling, delexicalised, intent])
        output_file = out_folder / '{}.csv'.format(split)
        write_csv(csv_data, output_file)
        output_pickle_file = out_folder / '{}_slot_values.pkl'.format(split)
        with open(output_pickle_file, 'wb') as f:
            pickle.dump(slotdic, f)
        print('Dumped slot dictionary')
        # log the last converted sentence as an example
        print('Example:')
        print('Original utterance:', utterance)
        print('Labelled:', labelling)
        print('Delexicalised:', delexicalised)
    print('Successfully converted json2csv!')
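# Hypothetical invocation, assuming the per-intent layout this function
# expects: <datadir>/<intent>/train_<intent>_full.json and
# <datadir>/<intent>/validate_<intent>.json.
# json2csv('raw_json/', 'csv_out/', samples_per_intent=2000)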
def new_json2csv(datadir, outdir):
    """Convert a single dataset.json into train.csv and validate.csv,
    holding out a fraction of each intent's utterances for validation."""
    data_folder = Path(datadir)
    out_folder = Path(outdir)
    data = load_json(data_folder / 'dataset.json', encoding='latin1')
    remove_punctuation = True
    punctuation = [',', '.', ';', '?', '!', '\"']
    val_fraction = 0.2
    for split in ['train', 'validate']:
        csv_data = [['utterance', 'labels', 'delexicalised', 'intent']]
        for intent in data['intents'].keys():
            num_val_sentences = int(
                val_fraction * len(data['intents'][intent]['utterances']))
            print(split, intent, num_val_sentences)
            # the first num_val_sentences utterances of each intent go to
            # the validation split, the rest to the train split
            if split == 'validate':
                sentences = data['intents'][intent]['utterances'][
                            :num_val_sentences]
            else:
                sentences = data['intents'][intent]['utterances'][
                            num_val_sentences:]
            print(len(sentences))
            for sentence in sentences:
                utterance = ''
                labelling = ''
                delexicalised = ''
                for group in sentence['data']:
                    words = group['text']
                    try:
                        words = words.encode('latin-1').decode('utf8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        if 'entity' not in group.keys():
                            print('skipping because of bad encoding: {}'
                                  .format(words))
                            continue
                        else:
                            words = words.encode('utf8').decode('utf8')
                    if remove_punctuation:
                        for p in punctuation:
                            words = words.replace(p, '')
                    # trailing newlines are misread by the csv writer
                    words = words.replace('\n', '')
                    utterance += words
                    if 'slot_name' in group.keys():  # this group is a slot
                        slot = group['slot_name'].lower()
                        if remove_punctuation:
                            for p in punctuation:
                                slot = slot.replace(p, '')
                        delexicalised += '_' + slot + '_'
                        # BIO tags: B- on the first token, I- on the rest
                        for i, word in enumerate(word_tokenize(words)):
                            if i == 0:
                                word = 'B-' + slot + ' '
                            else:
                                word = 'I-' + slot + ' '
                            labelling += word
                    else:  # this group is just context
                        delexicalised += words
                        labelling += 'O ' * len(word_tokenize(words))
                csv_data.append(
                    [utterance, labelling, delexicalised, intent])
        output_file = out_folder / '{}.csv'.format(split)
        write_csv(csv_data, output_file)
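# Hypothetical invocation, assuming <datadir>/dataset.json follows the
# single-file format read above ('intents' -> <intent> -> 'utterances').
# new_json2csv('raw_json/', 'csv_out/')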