def setup_datasets(dataset_name, root='.data', vocab_size=20000, include_unk=False):
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    # generate sentencepiece pretrained tokenizer
    if not path.exists('m_user.model'):
        logging.info('Generate SentencePiece pretrained tokenizer...')
        generate_sp_model(train_csv_path, vocab_size)

    sp_model = load_sp_model("m_user.model")
    sp_generator = sentencepiece_numericalizer(sp_model)
    train_data, train_labels = _create_data_with_sp_transform(
        sp_generator, train_csv_path)
    test_data, test_labels = _create_data_with_sp_transform(
        sp_generator, test_csv_path)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        None, train_data, train_labels),
        text_classification.TextClassificationDataset(
            None, test_data, test_labels))

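# The function above relies on a `_create_data_with_sp_transform(sp_generator, path)`
# helper that is not shown. The sketch below is only an assumption of what such a
# helper could look like, following the label-first CSV layout and 1-based labels of
# torchtext's legacy text_classification datasets; it is not the original code.
import csv
import io

import torch


def _create_data_with_sp_transform(sp_generator, data_path):
    """Numericalize each CSV row with the SentencePiece generator (sketch)."""
    data = []
    labels = set()
    with io.open(data_path, encoding='utf8') as f:
        for row in csv.reader(f):
            label = int(row[0]) - 1          # labels assumed 1-based in the CSV
            text = ' '.join(row[1:])
            token_ids = list(sp_generator([text]))[0]
            data.append((label, torch.tensor(token_ids)))
            labels.add(label)
    return data, labels
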
def prepairData(path, ngrams=NGRAMS, vocab=None):
    if not os.path.isdir(path):
        logging.error('Data path err')
        return

    train_csv_path = path + 'train.csv'
    test_csv_path = path + 'test.csv'

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = torch_text.build_vocab_from_iterator(
            torch_text._csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")

    train_data, train_labels = torch_text._create_data_from_iterator(
        vocab,
        torch_text._csv_iterator(train_csv_path, ngrams, yield_cls=True),
        include_unk=False)

    logging.info('Creating testing data')
    test_data, test_labels = torch_text._create_data_from_iterator(
        vocab,
        torch_text._csv_iterator(test_csv_path, ngrams, yield_cls=True),
        include_unk=False)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (torch_text.TextClassificationDataset(vocab, train_data, train_labels),
            torch_text.TextClassificationDataset(vocab, test_data, test_labels))

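# A possible call site, shown only as an illustrative sketch (the directory name and
# ngrams value are assumptions, not part of the original): note the trailing slash,
# which the function needs because the CSV names are appended by plain string
# concatenation. A pre-built Vocab can be reused across calls via the `vocab` argument.
train_dataset, test_dataset = prepairData('./.data/ag_news_csv/', ngrams=2)
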
def __init__(self, device, **kwargs):
    self.device = device
    self.batch_size = kwargs.get('batch_size')
    self.path_to_data = kwargs.pop('path_to_data')
    self.path_to_vectors = kwargs.pop('path_to_vectors')
    self.emb_dim = kwargs.pop('emb_dim')
    self.voc_size = kwargs.pop('voc_size')
    self.min_freq = kwargs.pop('min_freq', 1)
    self.fix_length = kwargs.pop('fix_len', 203)

    self.path_train_data = self.path_to_data + '/yelp_15/yelp.train.txt'
    self.path_val_data = self.path_to_data + '/yelp_15/yelp.valid.txt'
    self.path_test_data = self.path_to_data + '/yelp_15/yelp.test.txt'

    print("build vocab")
    vocab = self.build_vocab_from_textfile(self.path_train_data)

    print("create train split")
    list_train_data, list_train_labels = self.create_data_from_textfile(
        vocab, self.path_train_data, include_unk=True)
    train = text_classification.TextClassificationDataset(
        vocab, list_train_data, list_train_labels)

    print("create val split")
    list_val_data, list_val_labels = self.create_data_from_textfile(
        vocab, self.path_val_data, include_unk=True)
    valid = text_classification.TextClassificationDataset(
        vocab, list_val_data, list_val_labels)

    print("create test split")
    list_test_data, list_test_labels = self.create_data_from_textfile(
        vocab, self.path_test_data, include_unk=True)
    test = text_classification.TextClassificationDataset(
        vocab, list_test_data, list_test_labels)

    print("create data loaders")
    self._train_iter = DataLoader(train,
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  collate_fn=self.generate_batch)
    self._valid_iter = DataLoader(valid,
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  collate_fn=self.generate_batch)
    self._test_iter = DataLoader(test,
                                 batch_size=self.batch_size,
                                 shuffle=True,
                                 collate_fn=self.generate_batch)

    self.train_vocab = vocab

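# The three loaders above delegate batching to `self.generate_batch`, which is not
# shown. Below is a minimal sketch (not the original method) of such a collate
# function, assuming each sample is a (label, token_id_tensor) pair as produced by
# TextClassificationDataset and that sequences are padded or truncated to the stored
# fix_length; the pad index of 1 is an assumption, and the real implementation could
# instead build offsets for an EmbeddingBag model. In the class it would be a bound
# method reading self.fix_length.
import torch


def generate_batch(batch, fix_length=203, pad_id=1):
    labels = torch.tensor([label for label, _ in batch], dtype=torch.long)
    padded = torch.full((len(batch), fix_length), pad_id, dtype=torch.long)
    for i, (_, tokens) in enumerate(batch):
        length = min(len(tokens), fix_length)
        padded[i, :length] = tokens[:length]
    return padded, labels
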
def _setup_datasets(self,
                    dataset_name,
                    root='./data',
                    ngrams=1,
                    vocab=None,
                    include_unk=True,
                    download=False):
    if download:
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = extract_archive(dataset_tar)
        for fname in extracted_files:
            if fname.endswith('train.csv'):
                train_csv_path = fname
            if fname.endswith('test.csv'):
                test_csv_path = fname
    else:
        dir_name = root + "/" + dataset_name + "/"
        train_csv_path = dir_name + "train.csv"
        test_csv_path = dir_name + "test.csv"

    if vocab is None:
        print('Building Vocab based on {}'.format(train_csv_path))
        vocab = self.build_vocab_from_iterator(
            text_classification._csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    print('Vocab has {} entries'.format(len(vocab)))

    print('Creating training data')
    train_data, train_labels = text_classification._create_data_from_iterator(
        vocab,
        text_classification._csv_iterator(train_csv_path, ngrams, yield_cls=True),
        include_unk)

    print('Creating testing data')
    test_data, test_labels = text_classification._create_data_from_iterator(
        vocab,
        text_classification._csv_iterator(test_csv_path, ngrams, yield_cls=True),
        include_unk)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        vocab, train_data, train_labels),
        text_classification.TextClassificationDataset(
            vocab, test_data, test_labels))

def setup_datasets(dataset_name, root='.data', vocab_size=20000, include_unk=False):
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    train_data, train_labels = _create_data_with_sp_transform(train_csv_path)
    test_data, test_labels = _create_data_with_sp_transform(test_csv_path)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        None, train_data, train_labels),
        text_classification.TextClassificationDataset(
            None, test_data, test_labels))

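# The datasets returned above hold variable-length id tensors, so a custom collate
# function is needed to batch them. The sketch below is illustrative only: it follows
# the common legacy-torchtext pattern of concatenating ids and passing offsets to an
# EmbeddingBag-style model, and 'AG_NEWS' is just an assumed key of the surrounding
# URLS dict, not something defined in this file.
import torch
from torch.utils.data import DataLoader


def batch_with_offsets(batch):
    labels = torch.tensor([label for label, _ in batch], dtype=torch.long)
    texts = [tokens for _, tokens in batch]
    offsets = torch.tensor([0] + [len(t) for t in texts[:-1]]).cumsum(dim=0)
    return torch.cat(texts), offsets, labels


train_dataset, test_dataset = setup_datasets('AG_NEWS', root='.data', vocab_size=20000)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                          collate_fn=batch_with_offsets)
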
def setup_datasets(dataset_name, root='.data', vocab_size=60000, include_unk=False):
    train_csv_path = './.data/hackson/train.csv'
    test_csv_path = './.data/hackson/test_withsomelabels.csv'

    from torchtext.vocab import build_vocab_from_iterator
    vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path))

    train_data, train_labels = _create_data_with_sp_transform(
        train_csv_path, vocab)
    test_data, test_labels = _create_data_with_sp_transform(
        test_csv_path, vocab)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        None, train_data, train_labels),
        text_classification.TextClassificationDataset(
            None, test_data, test_labels))

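# The vocabulary above is built from a local `_csv_iterator` that takes only a path,
# unlike the ngrams-aware torchtext helper used elsewhere in this file. The sketch
# below is an assumption of what such a minimal iterator could look like; the label
# column is assumed to come first, as in the other CSV readers here.
import csv
import io

from torchtext.data.utils import get_tokenizer


def _csv_iterator(data_path):
    """Yield a list of tokens per CSV row (sketch, not the original helper)."""
    tokenizer = get_tokenizer('basic_english')
    with io.open(data_path, encoding='utf8') as f:
        for row in csv.reader(f):
            yield tokenizer(' '.join(row[1:]))
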