Code example #1
File: helper.py  Project: zjjhit/PycharmDL
import logging

# These helpers come from the legacy torchtext (<= 0.8) text-classification
# pipeline, where the underscore-prefixed ones are module-level functions.
from torchtext.utils import extract_archive
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets.text_classification import (
    TextClassificationDataset, _csv_iterator, _create_data_from_iterator)


def make_data(path_root='../data/ag_news_csv.tgz',
              ngrams=2,
              vocab=None,
              include_unk=False):
    extracted_files = extract_archive(path_root)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    # Build the vocab from the training split unless the caller supplied one;
    # otherwise the vocab parameter would be silently ignored.
    if vocab is None:
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))

    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')

    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True),
        include_unk)
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True),
        include_unk)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
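
For context, a minimal usage sketch (not from the project above): it assumes the archive exists at the default path and that the legacy torchtext helpers are importable. The generate_batch collate function is the usual offsets pattern for nn.EmbeddingBag models, not something defined in the snippet.

import torch
from torch.utils.data import DataLoader

train_dataset, test_dataset = make_data(ngrams=2)

def generate_batch(batch):
    # Each dataset entry is a (label, token-id tensor) pair; concatenate the
    # variable-length tensors and record offsets for nn.EmbeddingBag.
    labels = torch.tensor([entry[0] for entry in batch])
    texts = [entry[1] for entry in batch]
    offsets = torch.tensor([0] + [len(t) for t in texts[:-1]]).cumsum(dim=0)
    return torch.cat(texts), offsets, labels

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                          collate_fn=generate_batch)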
Code example #2
File: main.py  Project: stasstie/tu-bewerbung
# Requires the same legacy torchtext imports as Code example #1, plus os,
# logging, and torchtext.vocab.Vocab.
def _setup_datasets(root='.data', ngrams=1, vocab=None, include_unk=False):
    file_list = os.listdir(root)

    for fname in file_list:
        if fname.endswith('DSL-TRAIN.txt'):
            train_csv_path = os.path.join(root, fname)
        if fname.endswith('DSL-TEST-GOLD.txt'):
            test_csv_path = os.path.join(root, fname)

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(
            train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True),
        include_unk)
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True),
        include_unk)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
Code example #3
def _setup_datasets(dataset_name,
                    root='.data',
                    ngrams=1,
                    vocab=None,
                    include_unk=False):
    # The stock download step is bypassed in favor of a locally downloaded
    # archive, so the dataset_name and root parameters go unused here.
    # dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive('.data/ag_news_csv.tar.gz')

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    if vocab is None:
        print('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(
            train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    print('Vocab has {} entries'.format(len(vocab)))
    print('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True),
        include_unk)
    print('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True),
        include_unk)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels), vocab)
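
Unlike the earlier variants, this one also returns the vocabulary, so further data can be encoded with the same token-to-id mapping. A minimal sketch (the second call is illustrative, not from the project):

# First call builds the vocab from the training split.
train_ds, test_ds, vocab = _setup_datasets('AG_NEWS')

# Passing the prebuilt vocab skips the vocab-building pass and keeps token
# ids consistent across datasets.
train_ds2, test_ds2, _ = _setup_datasets('AG_NEWS', vocab=vocab)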
Code example #4
def setup_datasets(train_csv_path, test_csv_path, include_unk=False):
    # NGRAMS is assumed to be a module-level constant; the label keyword
    # implies a project-specific _csv_iterator rather than the stock
    # torchtext one.
    vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, NGRAMS))
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, NGRAMS, yield_cls=True, label=0),
        include_unk)
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, NGRAMS, yield_cls=True, label=0),
        include_unk)
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
Code example #5
File: pytorch_text_2.py  Project: george-sq/qiao-lab
def _setup_datasets(dataset_name,
                    root='.data',
                    ngrams=1,
                    vocab=None,
                    include_unk=False):
    ##################################################################################
    # The stock torchtext logic downloads these datasets from servers that
    # are blocked here, so after downloading them manually the loading logic
    # was adjusted to read the local copies. The downloaded datasets
    # (AG_NEWS and SogouNews) live under ~/data/torch.
    data_dir = os.path.join(os.path.expanduser('~'), "data/torch")
    dataset_file_name = {
        'AG_NEWS': os.path.join(data_dir, "ag_news_csv.tar.gz"),
        'SogouNews': os.path.join(data_dir, "sogou_news_csv.tar.gz"),
    }

    if dataset_name in dataset_file_name:
        extracted_files = extract_archive(dataset_file_name[dataset_name])
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = extract_archive(dataset_tar)

    train_csv_path = ""
    test_csv_path = ""
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname
    ##################################################################################

    # Load the data files and build the vocabulary.
    logging.info("Creating Vocab")
    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    # Build the training dataset.
    logging.info('Creating training data')
    train_data, train_labels = create_data_from_iterator(
        vocab, csv_iterator(train_csv_path, ngrams, yield_cls=True),
        include_unk)
    logging.info(f"training data size: {len(train_data)}")
    # Build the test dataset.
    logging.info('Creating testing data')
    test_data, test_labels = create_data_from_iterator(
        vocab, csv_iterator(test_csv_path, ngrams, yield_cls=True),
        include_unk)
    logging.info(f"testing data size: {len(test_data)}")
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
Code example #6
def loadData(train_csv_path, test_csv_path, ngrams):
    vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))

    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), False)
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), False)

    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))
Code example #7
def _setup_datasets(data_to_parse, ngrams=1, vocab=None, include_unk=False):
    """
    Builds a torchtext TextClassificationDataset from the given data.

    :param data_to_parse: labeled text data, consumed via _pd_iterator
        (the name suggests a pandas DataFrame)
    :param ngrams: degree of n-grams to extract from each text
    :param vocab: optional prebuilt torchtext Vocab; built from the data
        when None
    :param include_unk: whether to keep unknown-token ids in the output
    :return: a TextClassificationDataset over the parsed data
    """
    if vocab is None:
        vocab = build_vocab_from_iterator(_pd_iterator(data_to_parse, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")

    train_data, train_labels = _create_data_from_iterator(
        vocab, _pd_iterator(data_to_parse, ngrams, yield_cls=True),
        include_unk)

    return TextClassificationDataset(vocab, train_data, train_labels)
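
The _pd_iterator helper is project-specific and not shown. A plausible sketch, mirroring the tokenize-then-ngram logic of torchtext's _csv_iterator but reading from a pandas DataFrame (the column names 'label' and 'text' are assumptions):

from torchtext.data.utils import get_tokenizer, ngrams_iterator

def _pd_iterator(data_to_parse, ngrams, yield_cls=False):
    # Hypothetical reconstruction; the real helper may differ.
    tokenizer = get_tokenizer('basic_english')
    for _, row in data_to_parse.iterrows():
        tokens = tokenizer(row['text'])
        if yield_cls:
            yield int(row['label']), ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)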
Code example #8
File: final.py  Project: CarelKuusk/MLproject
def setup_datasets_testing_from_df(self, vocab, df, include_unk=False):
    # Method of a larger class (note self): builds a test-only dataset from
    # a DataFrame, reusing a vocab built elsewhere; NGRAMS is a module-level
    # constant in that project.
    test_data, test_labels = self.create_iterator_from_file_for_testing(
        vocab, NGRAMS, df)
    return TextClassificationDataset(vocab, test_data, test_labels), vocab
Code example #9
def csv_dataset(csv_path, ngrams):
    vocab = build_vocab_from_iterator(_csv_iterator(csv_path, ngrams))
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(csv_path, ngrams, yield_cls=True), False)
    return TextClassificationDataset(vocab, train_data, train_labels)
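
Since csv_dataset builds a single dataset from one file, a validation split is often carved out afterwards. A minimal sketch using torch.utils.data.random_split (the path and the 95/5 ratio are arbitrary choices):

from torch.utils.data import random_split

dataset = csv_dataset('.data/ag_news_csv/train.csv', ngrams=2)

# Hold out 5% of the examples for validation.
train_len = int(len(dataset) * 0.95)
train_split, valid_split = random_split(
    dataset, [train_len, len(dataset) - train_len])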