def make_data(path_root='../data/ag_news_csv.tgz', ngrams=2, vocab=None, include_unk=False):
    """Extract an AG-News-style archive and build train/test datasets.

    Args:
        path_root: path to the dataset archive containing train.csv/test.csv.
        ngrams: ngram size forwarded to the CSV iterator.
        vocab: optional pre-built Vocab; built from the training CSV when None.
        include_unk: whether unknown tokens are kept in the numericalized data.

    Returns:
        Tuple of (train TextClassificationDataset, test TextClassificationDataset).

    Raises:
        TypeError: if a supplied ``vocab`` is not a Vocab instance.
        ValueError: if the train and test label sets differ.
    """
    extracted_files = extract_archive(path_root)
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname
    # Bug fix: the original ignored a caller-supplied ``vocab`` and always
    # rebuilt it from the training CSV. Honor the parameter, and validate its
    # type the same way the sibling _setup_datasets helpers do.
    if vocab is None:
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    elif not isinstance(vocab, Vocab):
        raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    # Symmetric difference must be empty: both splits carry the same labels.
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
def _setup_datasets(root='.data', ngrams=1, vocab=None, include_unk=False):
    """Locate DSL-TRAIN/DSL-TEST-GOLD files under *root* and build datasets.

    Args:
        root: directory scanned for the DSL text files.
        ngrams: ngram size forwarded to the CSV iterator.
        vocab: optional pre-built Vocab; built from the training file when None.
        include_unk: whether unknown tokens are kept in the numericalized data.

    Returns:
        Tuple of (train TextClassificationDataset, test TextClassificationDataset).

    Raises:
        TypeError: if a supplied ``vocab`` is not a Vocab instance.
        ValueError: if the train and test label sets differ.
    """
    # A file name cannot match both suffixes, so elif is safe here.
    for entry in os.listdir(root):
        if entry.endswith('DSL-TRAIN.txt'):
            train_csv_path = os.path.join(root, entry)
        elif entry.endswith('DSL-TEST-GOLD.txt'):
            test_csv_path = os.path.join(root, entry)

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    elif not isinstance(vocab, Vocab):
        raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))

    logging.info('Creating training data')
    train_rows = _csv_iterator(train_csv_path, ngrams, yield_cls=True)
    train_data, train_labels = _create_data_from_iterator(vocab, train_rows, include_unk)

    logging.info('Creating testing data')
    test_rows = _csv_iterator(test_csv_path, ngrams, yield_cls=True)
    test_data, test_labels = _create_data_from_iterator(vocab, test_rows, include_unk)

    # Labels present in exactly one split signal a malformed dataset.
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")

    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, include_unk=False):
    """Build train/test datasets (plus the vocab) from a local ag_news archive.

    Args:
        dataset_name: kept for signature compatibility; the archive name is
            fixed to ag_news_csv.tar.gz in this offline variant.
        root: directory holding the pre-downloaded archive.
        ngrams: ngram size forwarded to the CSV iterator.
        vocab: optional pre-built Vocab; built from train.csv when None.
        include_unk: whether unknown tokens are kept in the numericalized data.

    Returns:
        Tuple of (train dataset, test dataset, vocab).

    Raises:
        TypeError: if a supplied ``vocab`` is not a Vocab instance.
        ValueError: if the train and test label sets differ.
    """
    # Download intentionally disabled; a pre-fetched archive is expected locally.
    # dataset_tar = download_from_url(URLS[dataset_name], root=root)
    # Fix: honor the ``root`` argument instead of hard-coding the '.data'
    # prefix (the default still resolves to '.data/ag_news_csv.tar.gz').
    extracted_files = extract_archive(os.path.join(root, 'ag_news_csv.tar.gz'))
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname
    # Consistency fix: use logging like the sibling setup helpers, not print.
    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    elif not isinstance(vocab, Vocab):
        raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    # Symmetric difference must be empty: both splits carry the same labels.
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels),
            vocab)
def setup_datasets(train_csv_path, test_csv_path, include_unk=False):
    """Build (train, test) TextClassificationDatasets sharing one vocabulary.

    The vocabulary is always derived from the training CSV; the ngram size
    comes from the module-level NGRAMS constant.

    Args:
        train_csv_path: CSV file used both for the vocab and the train split.
        test_csv_path: CSV file for the test split.
        include_unk: whether unknown tokens are kept in the numericalized data.

    Returns:
        Tuple of (train dataset, test dataset).
    """
    vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, NGRAMS))

    def _numericalize(path):
        # Re-read the CSV with labels attached and map tokens through vocab.
        labelled = _csv_iterator(path, NGRAMS, yield_cls=True, label=0)
        return _create_data_from_iterator(vocab, labelled, include_unk)

    train_data, train_labels = _numericalize(train_csv_path)
    test_data, test_labels = _numericalize(test_csv_path)
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, include_unk=False):
    """Build train/test datasets from a locally pre-downloaded archive.

    NOTE(review): the stock torchtext logic downloads the datasets from
    remote servers that may be unreachable; this variant was adjusted to
    load manually downloaded copies instead. Pre-downloaded AG_NEWS and
    SogouNews archives are expected under ~/data/torch; any other
    dataset_name falls back to the original download path.
    """
    ##################################################################################
    # The original torchtext logic downloads the experiment datasets from
    # remote (possibly blocked) servers; after downloading them manually,
    # the loading logic was changed to use the local copies.
    # Downloaded datasets AG_NEWS and SogouNews are stored under: ~/data/torch
    data_dir = os.path.join(os.path.expanduser('~'), "data/torch")
    dataset_file_name = {
        'AG_NEWS': os.path.join(data_dir, "ag_news_csv.tar.gz"),
        'SogouNews': os.path.join(data_dir, "sogou_news_csv.tar.gz"),
    }
    extracted_files = []
    if dataset_name in dataset_file_name.keys():
        # Known dataset: extract the pre-downloaded archive directly.
        extracted_files = extract_archive(dataset_file_name[dataset_name])
        pass
    else:
        # Unknown dataset: fall back to the original download-then-extract path.
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = extract_archive(dataset_tar)
    train_csv_path = ""
    test_csv_path = ""
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname
    ##################################################################################
    # Load the data files.
    # Build the vocabulary.
    logger.info("Creating Vocab")
    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    # Build the training dataset.
    logging.info('Creating training data')
    train_data, train_labels = create_data_from_iterator(
        vocab, csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    logger.info(f"training data size: {len(train_data)}")
    # Build the test dataset.
    logging.info('Creating testing data')
    test_data, test_labels = create_data_from_iterator(
        vocab, csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    logger.info(f"testing data size: {len(test_data)}")
    # Symmetric difference must be empty: both splits carry the same labels.
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
def loadData(train_csv_path, test_csv_path, ngrams, include_unk=False):
    """Build train/test TextClassificationDatasets from two CSV files.

    Args:
        train_csv_path: CSV used both to build the vocab and as the train split.
        test_csv_path: CSV for the test split.
        ngrams: ngram size forwarded to the CSV iterator.
        include_unk: whether unknown tokens are kept in the numericalized
            data. Generalized from a hard-coded False; the default preserves
            the original behavior.

    Returns:
        Tuple of (train dataset, test dataset).
    """
    vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
def _setup_datasets(data_to_parse, ngrams=1, vocab=None, include_unk=False):
    """Turn an in-memory dataframe into a TextClassificationDataset.

    Args:
        data_to_parse: data consumed row-wise by ``_pd_iterator``.
        ngrams: ngram size forwarded to the iterator.
        vocab: optional pre-built Vocab; built from the data when None.
        include_unk: whether unknown tokens are kept in the numericalized data.

    Returns:
        A TextClassificationDataset over the parsed data.

    Raises:
        TypeError: if a supplied ``vocab`` is not a Vocab instance.
    """
    # Guard clause: validate a caller-supplied vocab before any work.
    if vocab is not None and not isinstance(vocab, Vocab):
        raise TypeError("Passed vocabulary is not of type Vocab")
    if vocab is None:
        vocab = build_vocab_from_iterator(_pd_iterator(data_to_parse, ngrams))
    labelled_rows = _pd_iterator(data_to_parse, ngrams, yield_cls=True)
    data, labels = _create_data_from_iterator(vocab, labelled_rows, include_unk)
    return TextClassificationDataset(vocab, data, labels)
def setup_datasets_testing_from_df(self, vocab, df, include_unk=False):
    """Wrap a dataframe's rows as a test TextClassificationDataset.

    Args:
        vocab: the Vocab used to numericalize the rows.
        df: dataframe handed to ``create_iterator_from_file_for_testing``.
        include_unk: accepted for signature parity with the training setup;
            not consulted by this path.

    Returns:
        Tuple of (TextClassificationDataset, vocab).
    """
    data, labels = self.create_iterator_from_file_for_testing(vocab, NGRAMS, df)
    dataset = TextClassificationDataset(vocab, data, labels)
    return dataset, vocab
def csv_dataset(csv_path, ngrams, include_unk=False):
    """Build a single TextClassificationDataset from one CSV file.

    Args:
        csv_path: CSV used both to build the vocab and as the data source.
        ngrams: ngram size forwarded to the CSV iterator.
        include_unk: whether unknown tokens are kept in the numericalized
            data. Generalized from a hard-coded False; the default preserves
            the original behavior.

    Returns:
        A TextClassificationDataset over the CSV's rows.
    """
    vocab = build_vocab_from_iterator(_csv_iterator(csv_path, ngrams))
    data, labels = _create_data_from_iterator(
        vocab, _csv_iterator(csv_path, ngrams, yield_cls=True), include_unk)
    return TextClassificationDataset(vocab, data, labels)