def read_dataset():
    """Load the OCR dataset, one-hot encode targets, flatten images, and split.

    Returns a (train, test) pair produced by a 0.66 split.
    """
    dataset = sets.Ocr()
    dataset = sets.OneHot(dataset.target, depth=2)(dataset, columns=['target'])
    # Collapse the trailing two image axes into a single flat feature axis.
    flat_shape = dataset.data.shape[:-2] + (-1,)
    dataset['data'] = dataset.data.reshape(flat_shape).astype(float)
    train, test = sets.Split(0.66)(dataset)
    return train, test
def get_dataset():
    """Read dataset and flatten images."""
    dataset = sets.Ocr()
    encode_targets = sets.OneHot(dataset.target, depth=2)
    dataset = encode_targets(dataset, columns=['target'])
    # Merge the last two (image) dimensions into one flat feature vector.
    dataset['data'] = dataset.data.reshape(
        dataset.data.shape[:-2] + (-1,)).astype(float)
    train, test = sets.Split(0.66)(dataset)
    return train, test
def test_embedding_found():
    """One-hot encoding reports the fraction of tokens found in the vocabulary."""
    dataset = sets.Dataset(data=list('ceabb'), target=list('abddd'))
    encoder = sets.OneHot(list('abc'))
    dataset, found = encoder(
        dataset, columns=['data', 'target'], return_found=True)
    # Of the 10 tokens, 6 are in-vocabulary: c, a, b, b (data) and a, b (target).
    assert found == 6 / 10
def test_semeval():
    """Run the SemEval relation pipeline end to end.

    Tokenizes, one-hot encodes targets, computes entity word distances,
    embeds tokens with GloVe, and concatenates data with the distances.
    """
    dataset = sets.SemEvalRelation()
    dataset = sets.Tokenize()(dataset)
    dataset = sets.OneHot(dataset.target)(dataset, columns=['target'])
    distance = sets.WordDistance('<e1>', '<e2>', depth=2)
    dataset = distance(dataset, column='data')
    dataset = sets.Glove(100, depth=2)(dataset, columns=['data'])
    concatenate = sets.Concat(2, 'data')
    dataset = concatenate(dataset, columns=('data', 'word_distance'))
def test_onehot(dataset):
    """One-hot targets have one column per class and a hot entry per row."""
    encoded = sets.OneHot(dataset.target)(dataset, columns=['target'])
    num_classes = len(np.unique(dataset.target))
    assert encoded.target.shape[1] == num_classes
    # Every row must contain at least one nonzero entry.
    assert (encoded.target.sum(axis=1)).all()
    assert (encoded.target.max(axis=1)).all()
def test_ocr():
    """The OCR dataset loads and its targets accept depth-2 one-hot encoding."""
    dataset = sets.Ocr()
    encoder = sets.OneHot(dataset.target, depth=2)
    dataset = encoder(dataset, columns=['target'])