def test_dataset_split_explicit():
    """ Dataset is split according to given indices """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)

    tokenized, _, _ = st.tokenize_sentences(sentences)
    result, result_dicts, added = st.split_train_val_test(sentences, dicts,
                                                          split_parameter,
                                                          extend_with=0)

    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
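# The test above relies on module-level fixtures (sentences, dicts, vocab and
# the explicit index lists). A minimal sketch of what those fixtures might
# look like; the exact values here are illustrative assumptions, not taken
# from the original test module.
example_sentences = [u'I am sentence {}'.format(i) for i in range(10)]
example_dicts = [{'label': 'sentence {}'.format(i)} for i in range(10)]
example_train_ind = list(range(0, 7))   # 70% of the examples
example_val_ind = list(range(7, 8))     # 10%
example_test_ind = list(range(8, 10))   # 20%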
def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter,
                                                      extend_with=0)

    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
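# Worked example of the ratio arithmetic checked above: with 10 sentences and
# split_parameter = [0.7, 0.1, 0.2], the expected partition sizes are 7, 1
# and 2. This sketch only illustrates the size computation asserted by the
# test; it does not call SentenceTokenizer.
def expected_split_sizes(n_samples, ratios):
    # Each split receives n_samples * ratio examples.
    return [int(n_samples * r) for r in ratios]

assert expected_split_sizes(10, [0.7, 0.1, 0.2]) == [7, 1, 2]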
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset
    with open(path, 'rb') as dataset:
        data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [str(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(
        texts, labels,
        [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)

    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
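# A hedged usage sketch for load_benchmark. The vocabulary path matches the
# one used elsewhere in this file; the dataset path and extend_with value are
# assumptions for illustration only.
import json

with open('../model/vocabulary.json', 'r') as f:
    example_vocab = json.load(f)

benchmark = load_benchmark('benchmark_dataset.pickle',  # assumed dataset path
                           example_vocab, extend_with=10000)
train_texts, val_texts, test_texts = benchmark['texts']
print('train/val/test sizes:',
      len(train_texts), len(val_texts), len(test_texts))
print('added {} tokens, maxlen={}, batch_size={}'.format(
    benchmark['added'], benchmark['maxlen'], benchmark['batch_size']))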
def convert_dataset(filepath, extend_with, vocab):
    # texts, labels, data, dset and maxlen are expected to be defined in the
    # enclosing script scope.
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(
        texts, labels,
        [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print('  done. Coverage: {}'.format(cover))
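# convert_dataset relies on a format_pickle helper that is not shown in this
# snippet. A minimal sketch of one plausible shape for it, purely as an
# assumption (the real helper may bundle the splits differently):
def format_pickle_sketch(dset, train_texts, val_texts, test_texts,
                         train_dicts, val_dicts, test_dicts):
    # Bundle the dataset name and the three splits into one picklable dict.
    return {'dataset': dset,
            'train_texts': train_texts,
            'val_texts': val_texts,
            'test_texts': test_texts,
            'train_dicts': train_dicts,
            'val_dicts': val_dicts,
            'test_dicts': test_dicts}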
def load_non_benchmark(data, vocab, extend_with=0):
    # Decode data
    try:
        texts = [x for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)
    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts, labels,
                                                   extend_with=extend_with)
    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
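# A hedged usage sketch for load_non_benchmark, assuming an in-memory dataset
# in the same {'texts': [...], 'info': [{'label': ...}, ...]} layout used
# above. The example data and the empty vocabulary are illustrative
# assumptions.
example_data = {
    'texts': [u'I am sentence {}'.format(i) for i in range(10)],
    'info': [{'label': i % 2} for i in range(10)],
}
split = load_non_benchmark(example_data, vocab={}, extend_with=100)
print('split sizes:', [len(t) for t in split['texts']],
      'added tokens:', split['added'])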
for p in DATASET_PATHS:
    coverage_result = [p]
    print('Calculating coverage for {}'.format(p))
    with open(p, 'rb') as f:
        s = pickle.load(f)

    # Decode data
    try:
        s['texts'] = [str(x) for x in s['texts']]
    except UnicodeDecodeError:
        s['texts'] = [x.decode('utf-8') for x in s['texts']]

    # Own
    st = SentenceTokenizer({}, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'], s['info'],
        [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)
    coverage_result.append(coverage(tests[2]))

    # Last
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'], s['info'],
        [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=0)
    coverage_result.append(coverage(tests[2]))

    # Full
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'], s['info'],
        [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)  # call truncated in the original; arguments
                            # assumed to mirror the two passes above
    coverage_result.append(coverage(tests[2]))
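# The coverage() helper used above is not shown in this snippet. A minimal
# sketch of one plausible definition, assuming the tokenized test set is a
# 2-D array of word indices where 0 marks padding and 1 marks the unknown
# token (both index conventions are assumptions):
import numpy as np

def coverage_sketch(tokenized_test, unk_index=1, pad_index=0):
    tokens = np.asarray(tokenized_test)
    real = tokens != pad_index            # ignore padding positions
    known = real & (tokens != unk_index)  # tokens found in the vocabulary
    return known.sum() / float(real.sum())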
    u'I am sentence 7',
    u'I am sentence 8',
    u'I am sentence 9 newword',
]

INFO_DICTS = [
    {'label': 'sentence 0'},
    {'label': 'sentence 1'},
    {'label': 'sentence 2'},
    {'label': 'sentence 3'},
    {'label': 'sentence 4'},
    {'label': 'sentence 5'},
    {'label': 'sentence 6'},
    {'label': 'sentence 7'},
    {'label': 'sentence 8'},
    {'label': 'sentence 9'},
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split explicitly
print(st.split_train_val_test(DATASET, INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))