def resplit_datasets(dataset, other_dataset, random_seed=None, cut=None):
    """ Deterministic shuffle and split algorithm.

    Given the same two datasets and the same ``random_seed``, the split happens the same exact
    way every call.

    Args:
        dataset (torchnlp.datasets.Dataset)
        other_dataset (torchnlp.datasets.Dataset)
        random_seed (int, optional)
        cut (float, optional): Float between 0 and 1 to cut the dataset; otherwise, the same
            proportions are kept.

    Returns:
        dataset (torchnlp.datasets.Dataset)
        other_dataset (torchnlp.datasets.Dataset)
    """
    concat = dataset.rows + other_dataset.rows
    # Reference:
    # https://stackoverflow.com/questions/19306976/python-shuffling-with-a-parameter-to-get-the-same-result
    # NOTE: Shuffle the same way on every call of `resplit_datasets` where `random_seed` is given
    random.Random(random_seed).shuffle(concat)
    if cut is None:
        return Dataset(concat[:len(dataset)]), Dataset(concat[len(dataset):])
    else:
        cut = max(min(round(len(concat) * cut), len(concat)), 0)
        return Dataset(concat[:cut]), Dataset(concat[cut:])
def test_resplit_datasets():
    a = Dataset([{'r': 1}, {'r': 2}, {'r': 3}, {'r': 4}, {'r': 5}])
    b = Dataset([{'r': 6}, {'r': 7}, {'r': 8}, {'r': 9}, {'r': 10}])
    # Test determinism
    a, b = resplit_datasets(a, b, random_seed=123)
    assert list(a) == [{'r': 9}, {'r': 8}, {'r': 6}, {'r': 10}, {'r': 3}]
    assert list(b) == [{'r': 4}, {'r': 7}, {'r': 2}, {'r': 5}, {'r': 1}]
def resplit_datasets(dataset, other_dataset, random_seed=None, split=None):
    """Deterministic shuffle and split algorithm.

    Given the same two datasets and the same ``random_seed``, the split happens the same exact
    way every call.

    Args:
        dataset (lib.datasets.Dataset): First dataset.
        other_dataset (lib.datasets.Dataset): Another dataset.
        random_seed (int, optional): Seed to control the shuffle of both datasets.
        split (float, optional): If defined, the percentage of rows the first dataset gets
            after the split; otherwise, the original proportions are kept.

    Returns:
        :class:`lib.datasets.Dataset`, :class:`lib.datasets.Dataset`: Resplit datasets.
    """
    # Prevent circular dependency
    from torchnlp.datasets import Dataset

    concat = dataset.rows + other_dataset.rows
    shuffle(concat, random_seed=random_seed)
    if split is None:
        return Dataset(concat[:len(dataset)]), Dataset(concat[len(dataset):])
    else:
        split = max(min(round(len(concat) * split), len(concat)), 0)
        return Dataset(concat[:split]), Dataset(concat[split:])
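# NOTE: The `shuffle` helper called above is not defined in this section. Below is a minimal
# sketch of the behaviour it is assumed to have, mirroring the seeded
# `random.Random(random_seed).shuffle(...)` call used inline in the earlier variant.
# The name, signature, and default seed are assumptions, not a confirmed library API.
import random


def shuffle(list_, random_seed=123):
    """ Shuffle ``list_`` in place; deterministic whenever ``random_seed`` is fixed. """
    random.Random(random_seed).shuffle(list_)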
def parse_dataset(path,
                  label_to_idx,
                  word_to_idx,
                  pos_target=False,
                  pad_len=None,
                  encoding='latin-1',
                  max_len=100):
    sentences = []
    UNK = 3
    PAD = 1
    target_index = 1 if pos_target else 3
    nr_long = 0
    max_sus = 0
    with open(path, encoding=encoding) as f:
        sample = {'word_ids': [], 'labels': []}
        max_len_token = 0
        for line in f.read().splitlines():
            if line in ['\n', '\r\n', '']:
                # end of sequence
                if len(sample['labels']) > 100:
                    nr_long += 1
                if (len(sample['labels']) > 0) and (len(sample['word_ids']) < max_len):
                    max_sus = max(max_sus, len(sample['word_ids']))
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': []}
                continue
            else:
                ls = line.split()
                max_len_token = max(max_len_token, len(ls[4:]))
                word = ls[4:]
                label = ls[target_index]
                if len(word) > 0:
                    word_ids = [
                        word_to_idx[w] if w in word_to_idx.keys() else UNK for w in word
                    ]
                    sample['word_ids'].append(torch.LongTensor(word_ids))  # 3 -> <unk>
                    sample['labels'].append(label_to_idx[label])
                    if len(word_ids) > 20:
                        print(line)

    # pad all BPE encodings to max length in dataset
    if pad_len is not None:
        max_len_token = max(pad_len, max_len_token)
    for s in range(len(sentences)):
        sen = sentences[s]
        for i in range(len(sen['word_ids'])):
            sen['word_ids'][i] = pad_tensor(
                sen['word_ids'][i], length=max_len_token, padding_index=PAD)
        # stack word ids back together
        sen['word_ids'] = torch.stack(sen['word_ids'], dim=0).view(-1)
    print('max nr of SUs in sentence: {}'.format(max_sus))
    print('Number of long sentences: {}'.format(nr_long))
    return Dataset(sentences), max_len_token
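# NOTE: `pad_tensor` is used above but not defined in this section. Below is a minimal sketch
# of the behaviour it is assumed to have (right-pad a 1-D tensor with `padding_index` up to
# `length`); the real helper may differ, e.g. torchnlp ships a similarly named utility.
import torch


def pad_tensor(tensor, length, padding_index=1):
    """ Right-pad a 1-D ``tensor`` with ``padding_index`` up to ``length``. """
    n_padding = length - len(tensor)
    assert n_padding >= 0, 'tensor is longer than the requested pad length'
    if n_padding == 0:
        return tensor
    return torch.cat((tensor, tensor.new_full((n_padding,), padding_index)), dim=0)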
def test_dataset_set_row():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    dataset[0] = {'c': 'c'}
    assert dataset['c'] == ['c', None]
    assert dataset['a'] == [None, 'aa']
    dataset[0:2] = [{'d': 'd'}, {'d': 'dd'}]
    assert dataset[0] == {'d': 'd'}
    assert dataset[1] == {'d': 'dd'}
    with pytest.raises(IndexError):
        dataset[2] = {'c': 'c'}
def process_dataset(
        docs,
        label_to_idx,
        word_to_idx=None,
        word_counter=None,
        unk="<UNK>",
        pad="<PAD>",
        pad_idx=0,
        unk_idx=1,
        min_freq_word=50,
        label_value=1.0,
        binary_class=True,
):
    """Process a list of docs into a PyTorch-ready dataset."""
    dset = []
    tag_counter = Counter()
    stoi = None

    if min_freq_word:
        word_counter = Counter(
            [w for doc in docs for sent in doc.sentences for w in sent])

    if word_to_idx is None:
        word_to_idx = OrderedDict()
        word_to_idx[pad] = pad_idx
        word_to_idx[unk] = unk_idx
    elif min_freq_word:
        stoi = {
            k: v
            for k, v in word_to_idx.items()
            if (word_counter[k] >= min_freq_word) or (k in [pad, unk])
        }

    print("Loading and converting docs to PyTorch backend...")
    for doc in docs:
        sample, tag_counter = doc_to_sample(
            doc,
            label_to_idx,
            word_to_idx,
            word_counter,
            stoi=stoi,
            min_freq_word=min_freq_word,
            unk=unk,
            tag_counter=tag_counter,
            label_value=label_value,
            binary_class=binary_class,
        )
        dset.append(sample)

    return Dataset(dset), word_to_idx, tag_counter
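# NOTE: The least obvious step in `process_dataset` is the vocabulary filtering: when a
# pre-built `word_to_idx` is passed together with `min_freq_word`, rare words are dropped from
# the lookup table (`stoi`) while the pad/unk tokens are always kept. A self-contained sketch
# of that filtering on toy data follows; the words and counts here are illustrative only.
from collections import Counter, OrderedDict

pad, unk = "<PAD>", "<UNK>"
word_to_idx = OrderedDict([(pad, 0), (unk, 1), ("the", 2), ("aardvark", 3)])
word_counter = Counter({"the": 120, "aardvark": 2})
min_freq_word = 50

# Keep frequent words plus the special tokens; rare words later fall back to <UNK>.
stoi = {
    k: v
    for k, v in word_to_idx.items()
    if (word_counter[k] >= min_freq_word) or (k in [pad, unk])
}
assert "the" in stoi and "aardvark" not in stoi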
def test_dataset_set_column():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])

    # Regular column update
    dataset['a'] = ['aa', 'aaa']
    assert dataset['a'] == ['aa', 'aaa']

    # Too little
    dataset['b'] = ['b']
    assert dataset['b'] == ['b', None]

    # Too many
    dataset['c'] = ['c', 'cc', 'ccc']
    assert dataset['c'] == ['c', 'cc', 'ccc']

    # Smoke (regression test)
    random.shuffle(dataset)
def parse_dataset_laser(path,
                        label_to_idx,
                        word_to_idx,
                        pos_target=False,
                        encoding='latin-1',
                        max_len=100):
    sentences = []
    UNK = 3
    PAD = 1
    target_index = 1 if pos_target else 3
    with open(path, encoding=encoding) as f:
        sample = {'word_ids': [], 'labels': [], 'word_len': []}
        max_len_token = 0
        for line in f.read().splitlines():
            if line in ['\n', '\r\n', '']:
                # end of sequence
                if (len(sample['labels']) > 0) and (len(sample['word_ids']) < max_len):
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sample['word_ids'] = torch.LongTensor(sample['word_ids'])
                    sample['word_len'] = torch.LongTensor(sample['word_len'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': [], 'word_len': []}
                continue
            else:
                ls = line.split()
                max_len_token = max(max_len_token, len(ls[4:]))
                word = ls[4:]
                label = ls[target_index]
                if len(word) > 0:
                    word_ids = [
                        word_to_idx[w.lower()] if w.lower() in word_to_idx.keys() else UNK
                        for w in word
                    ]
                    sample['word_ids'].extend(word_ids)  # 3 -> <unk>
                    sample['word_len'].append(len(word_ids))
                    sample['labels'].append(label_to_idx[label])
                    if len(word_ids) > 20:
                        print(line)
    return Dataset(sentences), max_len_token
def parse_dataset_muse(path,
                       label_to_idx,
                       word_to_idx=None,
                       pos_target=False,
                       encoding='utf-8',
                       max_len=150):
    target_index = 1 if pos_target else 3
    sentences = []
    if word_to_idx is None:
        word_to_idx = OrderedDict()
        word_num = 0
    else:
        word_num = len(word_to_idx)
    with open(path, encoding=encoding) as f:
        sample = {'word_ids': [], 'labels': []}
        for line in f.read().splitlines():
            if line in ['\n', '\r\n', '']:
                # end of sequence
                if len(sample['labels']) > 0 and (len(sample['word_ids']) < max_len):
                    sample['word_ids'] = torch.LongTensor(sample['word_ids'])
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': []}
                continue
            else:
                ls = line.split()
                word = ls[0].lower()
                label = ls[target_index]
                if word not in word_to_idx.keys():
                    word_to_idx[word] = word_num
                    word_num += 1
                sample['word_ids'].append(word_to_idx[word])
                sample['labels'].append(label_to_idx[label])
    return Dataset(sentences), word_to_idx
def random_dataset(input_key='input',
                   output_key='output',
                   input_generator=random_sequence,
                   output_generator=random_sequence,
                   input_encoder=WhitespaceEncoder,
                   output_encoder=WhitespaceEncoder,
                   size=random.randint(1, 100)):
    """
    Returns:
        (torchnlp.datasets.Dataset) dataset over random data
    """
    rows = []
    for _ in range(size):
        row = {}
        row[input_key] = input_generator()
        row[output_key] = output_generator()
        rows.append(row)
    dataset = Dataset(rows)
    input_encoder = input_encoder(dataset[input_key])
    output_encoder = output_encoder(dataset[output_key])
    for row in dataset:
        row[input_key] = input_encoder.encode(row[input_key])
        row[output_key] = output_encoder.encode(row[output_key])
    return dataset, input_encoder, output_encoder
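# NOTE: A hedged usage sketch for `random_dataset` (not from the original), assuming
# `random_sequence` yields whitespace-separated strings and that `WhitespaceEncoder`
# follows torchnlp's usual encode/decode interface.
def example_random_dataset_usage():
    """ Round-trip one row of a small random dataset; the values depend on the random rows. """
    dataset, input_encoder, output_encoder = random_dataset(size=8)
    first_row = dataset[0]
    encoded = first_row['input']             # already encoded as a torch.LongTensor
    decoded = input_encoder.decode(encoded)  # back to a whitespace-joined string
    return encoded, decoded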
def test_resplit_datasets_cut():
    a = Dataset([{'r': 1}, {'r': 2}, {'r': 3}, {'r': 4}, {'r': 5}])
    b = Dataset([{'r': 6}, {'r': 7}, {'r': 8}, {'r': 9}, {'r': 10}])
    a, b = resplit_datasets(a, b, random_seed=123, split=0.3)
    assert len(a) == 3
    assert len(b) == 7
def test_dataset_init():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert len(dataset) == 2
    assert 'a' in dataset
    assert 'b' in dataset
    assert 'c' not in dataset
def test_dataset_concat():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    other_dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    concat = dataset + other_dataset
    assert len(concat) == 4
    assert list(concat) == dataset.rows + other_dataset.rows
def test_dataset_equality():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    other_dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert dataset == other_dataset
def test_dataset_get_row():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert dataset[0] == {'a': 'a', 'b': 'b'}
    assert dataset[1] == {'a': 'aa', 'b': 'bb'}
    with pytest.raises(IndexError):
        dataset[2]
def test_dataset_get_column():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert dataset['a'] == ['a', 'aa']
    assert dataset['b'] == ['b', 'bb']
    with pytest.raises(AttributeError):
        dataset['c']
def test_dataset_str():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert ' a b\n0 a b\n1 aa bb' == str(dataset)