def test_CRUD_dataset(capsys):
    datasets.create_dataset(
        service_account_json,
        api_key,
        project_id,
        cloud_region,
        dataset_id)

    datasets.get_dataset(
        service_account_json, api_key, project_id, cloud_region, dataset_id)

    datasets.list_datasets(
        service_account_json, api_key, project_id, cloud_region)

    # Test and also clean up
    datasets.delete_dataset(
        service_account_json, api_key, project_id, cloud_region, dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
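The module-level values used above (service_account_json, api_key, project_id, cloud_region, dataset_id) are defined elsewhere in the test module. A minimal sketch of how they might be set up, assuming they are read from environment variables (the variable names and defaults below are assumptions, not part of the original sample):

# Hypothetical test configuration; the environment variable names are assumptions.
import os
import uuid

service_account_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
api_key = os.environ.get('API_KEY')
project_id = os.environ.get('GOOGLE_CLOUD_PROJECT')
cloud_region = 'us-central1'

# A unique suffix keeps repeated or parallel test runs from colliding.
dataset_id = 'test-dataset-{}'.format(uuid.uuid4().hex[:8])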
Example #2
def test_CRUD_dataset(capsys, crud_dataset_id):
    datasets.create_dataset(
        project_id,
        cloud_region,
        crud_dataset_id)

    datasets.get_dataset(
        project_id, cloud_region, crud_dataset_id)

    datasets.list_datasets(
        project_id, cloud_region)

    datasets.delete_dataset(
        project_id, cloud_region, crud_dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
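The crud_dataset_id argument in this variant is a pytest fixture defined outside the snippet (typically in conftest.py). A plausible sketch of such a fixture, assuming it only needs to hand out a unique dataset ID (the naming scheme is an assumption):

# Hypothetical fixture; not taken from the original test suite.
import uuid

import pytest


@pytest.fixture(scope='module')
def crud_dataset_id():
    # Unique per run so repeated or parallel runs do not clash on the same ID.
    yield 'crud-dataset-{}'.format(uuid.uuid4().hex[:8])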
Example #3
def test_CRUD_dataset(capsys, crud_dataset_id):
    datasets.create_dataset(project_id, cloud_region, crud_dataset_id)

    @retry(wait_exponential_multiplier=1000,
           wait_exponential_max=10000,
           stop_max_attempt_number=10,
           retry_on_exception=retry_if_server_exception)
    def get_dataset():
        datasets.get_dataset(project_id, cloud_region, crud_dataset_id)

    get_dataset()

    datasets.list_datasets(project_id, cloud_region)

    datasets.delete_dataset(project_id, cloud_region, crud_dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
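The @retry decorator here comes from the retrying package, and retry_if_server_exception is a predicate defined elsewhere in the test module. A hedged sketch of what such a predicate might look like, assuming the client raises googleapiclient.errors.HttpError and that only 5xx responses should be retried (both points are assumptions):

# Hypothetical retry predicate; the exception type checked is an assumption.
from googleapiclient.errors import HttpError


def retry_if_server_exception(exception):
    # Retry only on transient server-side failures (HTTP 5xx), not client errors.
    return isinstance(exception, HttpError) and exception.resp.status >= 500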
Example #4
def test_CRUD_dataset(capsys):
    datasets.create_dataset(
        service_account_json,
        project_id,
        cloud_region,
        dataset_id)

    datasets.get_dataset(
        service_account_json, project_id, cloud_region, dataset_id)

    datasets.list_datasets(
        service_account_json, project_id, cloud_region)

    # Test and also clean up
    datasets.delete_dataset(
        service_account_json, project_id, cloud_region, dataset_id)

    out, _ = capsys.readouterr()

    # Check that create/get/list/delete worked
    assert 'Created dataset' in out
    assert 'Time zone' in out
    assert 'Dataset' in out
    assert 'Deleted dataset' in out
Example #5
    def setup(self, stage):
        logger.info("Loading raw data...")

        if self.name in list_datasets():
            logger.info("Loading HuggingFace dataset...")
            self.dataset_setup_fn = hugging_face_load_dataset
        else:
            logger.info("Loading local dataset...")
            self.dataset_setup_fn = file_load_dataset
            if not os.path.isfile(self.name):
                raise FileNotFoundError(
                    f"Passed in path `{self.name}` for dataset, but no such file found."
                )
        if stage == 'train':
            self.train = self.dataset_setup_fn(self.name, split="train")
            self.val = self.dataset_setup_fn(self.name, split="valid")
        elif stage == 'test':
            # DSYITF - don't shoot yourself in the foot. Comment this out when doing pre-prod testing.
            self.val = self.dataset_setup_fn(self.name, split="valid")
            # self.test = self.dataset_setup_fn(self.name, split="test")
        else:
            raise NotImplementedError()
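hugging_face_load_dataset and file_load_dataset are helpers defined outside this snippet; both are called with a name or path plus a split keyword. A rough sketch of what they might wrap, assuming HuggingFace's datasets.load_dataset underneath (the CSV handling and split mapping are assumptions):

# Hypothetical loader helpers; the signatures match the calls in setup(),
# but the implementations are assumptions.
from datasets import load_dataset


def hugging_face_load_dataset(name, split):
    # Load one split of a dataset hosted on the HuggingFace Hub.
    return load_dataset(name, split=split)


def file_load_dataset(path, split):
    # Load a local CSV file and index the resulting DatasetDict by split name.
    return load_dataset('csv', data_files={split: path})[split]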
Example #6
class Dataset:
    '''
    Loads datasets and tokenizes them
    '''

    HF_DATASETS = list_datasets()
    DATA_PATH = '../data/'

    TRAIN_STR = 'train'
    TEST_STR = 'test'
    VALIDATION_STR = 'validation'

    def __init__(self, name, split):
        '''
        Initializes dataset
        :param name: name of dataset
        :param split: train/validation/test split
        '''
        self.name = name
        self.split = split
        if self.name not in self.HF_DATASETS:
            self.type = 'csv'
        else:
            self.type = 'hf'

        self.data = self.get_dataset()

    def get_num_classes(self, label_column='label'):
        '''
        Fetches number of classes in dataset
        :param label_column: column name for label in dataset
        :return: number of classes in dataset
        '''
        return self.data.features[label_column].num_classes

    def get_dataset(self):
        '''
        Loads dataset from the HuggingFace hub or from a local CSV file
        '''
        if self.type == 'hf':
            if self.split == self.VALIDATION_STR:
                try:
                    return load_dataset(self.name, split=self.VALIDATION_STR)
                except ValueError:
                    pass
                try:
                    return load_dataset(self.name, split=self.TEST_STR)
                except ValueError:
                    raise RuntimeError(
                        'Invalid dataset. No validation set found.')
            else:
                return load_dataset(self.name, split=self.split)
        else:
            filename = os.path.join(self.DATA_PATH, self.name,
                                    str(self.split) + '.' + str(self.type))
            return load_dataset(self.type, data_files=filename)

    def student_dataset_encoder(self,
                                soft_labels,
                                batch_size,
                                text_column='text',
                                label_column='label'):
        '''
        Creates student dataset in tf.data.Dataset format along with student model encoder
        :param soft_labels: soft labels from teacher model
        :param batch_size: batch size
        :param text_column: column name for text in dataset
        :param label_column: column name for label in dataset
        :return: student dataset and student model encoder
        '''
        dataset = copy.deepcopy(self.data)
        dataset.set_format(type='tensorflow', columns=[text_column])
        features = dataset[text_column]
        hard_labels = tf.keras.utils.to_categorical(
            dataset[label_column],
            num_classes=self.get_num_classes(label_column=label_column))
        labels = {'soft': soft_labels, 'hard': hard_labels}
        tfdataset = tf.data.Dataset.from_tensor_slices(
            (features, labels)).shuffle(self.data.num_rows).batch(batch_size)
        VOCAB_SIZE = 30522
        encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=VOCAB_SIZE)
        encoder.adapt(tfdataset.map(lambda text, label: text))

        return tfdataset, encoder

    def classification_tokenize(self,
                                tokenizer,
                                batch_size,
                                max_seq_len,
                                model_name,
                                text_column='text',
                                label_column='label'):
        '''
        Tokenizes data for classification task
        :param tokenizer: tokenizer class
        :param batch_size: batch size
        :param max_seq_len: maximum sequence length
        :param model_name: model name
        :param text_column: column name for text in dataset
        :param label_column: column name for label in dataset
        :return: tokenized data
        '''
        def encode(example):
            return tokenizer(example[text_column],
                             padding='max_length',
                             truncation=True)

        dataset = self.data.map(encode)
        dataset.set_format(type='tensorflow',
                           columns=Model.MODEL_INPUTS[model_name] +
                           [label_column])
        features = {
            x: dataset[x].to_tensor(default_value=0, shape=(None, max_seq_len))
            for x in Model.MODEL_INPUTS[model_name]
        }
        labels = tf.keras.utils.to_categorical(
            dataset[label_column],
            num_classes=self.get_num_classes(label_column=label_column))
        tfdataset = tf.data.Dataset.from_tensor_slices(
            (features, labels)).shuffle(self.data.num_rows).batch(batch_size)
        return tfdataset
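A short usage sketch of this class, assuming a HuggingFace tokenizer and a Model.MODEL_INPUTS entry for the chosen model; the dataset name and hyperparameters below are placeholders:

# Illustrative usage; 'emotion' and the hyperparameters are placeholders.
from transformers import AutoTokenizer

train_data = Dataset(name='emotion', split=Dataset.TRAIN_STR)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_tfdataset = train_data.classification_tokenize(
    tokenizer,
    batch_size=32,
    max_seq_len=128,
    model_name='bert-base-uncased')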
Example #7
    def list_datasets(cls) -> List[str]:
        """List datasets on Huggingface datasets.

        Returns: list of datasets
        """
        return datasets.list_datasets()
Example #8
def get_dataset_list():
    return datasets.list_datasets()
Example #9
    st.error("Unable to load the templates file!\n\n"
             "We expect the file templates.yaml to be in the working directory. "
             "You might need to restart the app in the root directory of the repo.")
    st.stop()


def save_data(message="Done!"):
    with open("./templates.yaml", 'w') as f:
        templates.write_to_file(f)
        st.success(message)


#
# Loads dataset information
#
dataset_list = datasets.list_datasets()

#
# Initializes state
#
session_state = get_session_state(example_index=0, dataset=dataset_list[0])

#
# Select a dataset
#
# TODO: Currently raises an error if you select a dataset that requires a
# TODO: configuration. Not clear how to query for these options.
dataset_key = st.sidebar.selectbox('Dataset', dataset_list, key='dataset_select',
                 help='Select the dataset to work on. Number in parens ' +
                      'is the number of prompts created.')
st.sidebar.write("HINT: Try ag_news or trec for examples.")
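get_session_state is a helper from the surrounding app, written before Streamlit shipped st.session_state. A minimal stand-in built on the modern st.session_state API is sketched below; it is an assumption, not the original implementation:

# Hypothetical replacement for get_session_state using st.session_state.
import streamlit as st


def get_session_state(**defaults):
    # Fill in any missing keys with their defaults, then return the state object,
    # which supports both attribute and item access.
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
    return st.session_state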
Example #10
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM1 -- ACC:', svm1_scores['accuracy'], 'F1:', svm1_scores['f1'])

    svm_clf.fit(codings_train, train_y)
    pred = svm_clf.predict(codings_test)
    svm2_scores = {
        'accuracy': accuracy_score(test_y, pred),
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM2 -- ACC:', svm2_scores['accuracy'], 'F1:', svm2_scores['f1'])

    birnn_results.append({
        'model': 'BiGRUx2/relu/mse/adam',
        'dataset': data_name,
        'RF1-ACC': rf1_scores['accuracy'],
        'RF1-F1': rf1_scores['f1'],
        'RF2-ACC': rf2_scores['accuracy'],
        'RF2-F1': rf2_scores['f1'],
        'SVM1-ACC': svm1_scores['accuracy'],
        'SVM1-F1': svm1_scores['f1'],
        'SVM2-ACC': svm2_scores['accuracy'],
        'SVM2-F1': svm2_scores['f1']
    })


for dataset in list_datasets()[0]:
    evaluate(dataset)

pd.DataFrame(rnn_results).to_csv('./uni_rnn_results.csv', index=False)
pd.DataFrame(birnn_results).to_csv('./uni_birnn_results.csv', index=False)
Example #11
              and builder_instance.info.size_in_bytes < MAX_SIZE):
            builder_instance.download_and_prepare()
            dts = builder_instance.as_dataset()
            dataset = dts
        else:
            dataset = builder_instance
            fail = True
        return dataset, fail

    # Dataset select box.
    dataset_names = []
    selection = None

    import glob
    if path_to_datasets is None:
        list_of_datasets = datasets.list_datasets(
            with_community_datasets=False)
    else:
        list_of_datasets = sorted(glob.glob(path_to_datasets + "*"))
    print(list_of_datasets)
    for i, dataset in enumerate(list_of_datasets):
        dataset = dataset.split("/")[-1]
        if INITIAL_SELECTION and dataset == INITIAL_SELECTION:
            selection = i
        dataset_names.append(dataset)

    if selection is not None:
        option = st.sidebar.selectbox("Dataset",
                                      dataset_names,
                                      index=selection,
                                      format_func=lambda a: a)
    else:
Example #12
- Thrive on large datasets: frees you from RAM limitations, since all datasets are memory-mapped on drive by default.
- Smart caching with an intelligent `tf.data`-like cache: never wait for your data to be processed several times.

"🤗 Datasets" originated as a fork of the awesome TensorFlow Datasets, and the HuggingFace team wants to deeply thank the team behind this amazing library and user API.
We try to keep compatibility with tfds, and a converter can provide conversion from one format to the other.

"""
# pip install datasets

# Let's import the library. We usually only need at most four methods:
from datasets import list_datasets, list_metrics, load_dataset, load_metric

from pprint import pprint

# Datasets and metrics currently available
datasets = list_datasets()
metrics = list_metrics()

print(f"🤩 Currently {len(datasets)} datasets are available on the hub:")
pprint(datasets, compact=True)
print(f"🤩 Currently {len(metrics)} metrics are available on the hub:")
pprint(metrics, compact=True)

# You can access various attributes of the datasets before downloading them
squad_dataset = list_datasets(with_details=True)[datasets.index('squad')]

pprint(squad_dataset.__dict__)  # It's a simple python dataclass

# A sample from SQuAD

# Downloading and loading a dataset
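The comment above introduces the download-and-load step; a minimal sketch of what it typically looks like with load_dataset follows (the dataset name and split are illustrative, not the original continuation of this example):

# Illustrative sketch; not the original continuation of this example.
squad_train = load_dataset('squad', split='train')

print(squad_train)      # dataset size and column names
print(squad_train[0])   # first example: question, context, answers, ...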
Example #13
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM1 -- ACC:', svm1_scores['accuracy'], 'F1:', svm1_scores['f1'])

    svm_clf.fit(codings_train, train_y)
    pred = svm_clf.predict(codings_test)
    svm2_scores = {
        'accuracy': accuracy_score(test_y, pred),
        'f1': f1_score(test_y, pred, average='macro')
    }
    print('SVM2 -- ACC:', svm2_scores['accuracy'], 'F1:', svm2_scores['f1'])

    birnn_results.append({
        'model': 'BiGRUx2/relu/mse/adam',
        'dataset': data_name,
        'RF1-ACC': rf1_scores['accuracy'],
        'RF1-F1': rf1_scores['f1'],
        'RF2-ACC': rf2_scores['accuracy'],
        'RF2-F1': rf2_scores['f1'],
        'SVM1-ACC': svm1_scores['accuracy'],
        'SVM1-F1': svm1_scores['f1'],
        'SVM2-ACC': svm2_scores['accuracy'],
        'SVM2-F1': svm2_scores['f1']
    })


for dataset in list_datasets()[1]:
    evaluate(dataset)

pd.DataFrame(rnn_results).to_csv('./mul_rnn_results.csv', index=False)
pd.DataFrame(birnn_results).to_csv('./mul_birnn_results.csv', index=False)