Example #1
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, split_):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    # Materialize raw text iterable dataset
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }

    if vocab is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_data["train"], text_transform)
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            TextClassificationDataset(raw_data[item], vocab, (label_transform,
                                                              text_transform))
            for item in split), split_)
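A private helper like the one above is usually exposed through a thin public constructor per dataset; the sketch below is a hypothetical wrapper (the AG_NEWS name and its defaults are assumptions for illustration, not part of this example).

# Hypothetical public wrapper around the helper above; name and defaults are
# assumptions for illustration only.
def AG_NEWS(root='.data', ngrams=1, vocab=None, tokenizer=None,
            split=('train', 'test')):
    # _setup_datasets builds the vocab from the train split when vocab is None
    # and returns one TextClassificationDataset per requested split.
    return _setup_datasets('AG_NEWS', root, ngrams, vocab, tokenizer, split)

# train_dataset, test_dataset = AG_NEWS(ngrams=2)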
Example #2
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, data_select):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    data_select = check_default_set(data_select, ('train', 'test'))
    raw_datasets = raw.DATASETS[dataset_name](root=root,
                                              data_select=data_select)
    # Materialize raw text iterable dataset
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(data_select, raw_datasets)
    }

    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    return tuple(
        TextClassificationDataset(raw_data[item], vocab, (label_transform,
                                                          text_transform))
        for item in data_select)
Example #3
def niid_device(params):
    num_user = params['Trainer']['n_clients']
    dataset_user = params['Dataset']['user']
    assert num_user == dataset_user  # must match exactly
    usernames = list(dict(df[4].value_counts()))[:dataset_user]
    df_small = df.loc[df[4].isin(usernames)]
    df_small = df_small.sample(frac=1) # shuffle all the data
    df_train = df_small.iloc[:int(df_small.shape[0] * 0.9), :]
    df_test = df_small.iloc[int(df_small.shape[0] * 0.9):, :]
    text_transform = sequential_transforms(
        str.lower, 
        get_tokenizer("basic_english"),
    )
    counter = Counter(dict(
        get_vocab_counter(df_train[5], text_transform).most_common(3000 - 2)
    ))
    vocab = Vocab(
        counter, 
        vectors='glove.6B.300d', 
        vectors_cache='./data/vector_cache/',
    )
    text_transform = sequential_transforms(
        text_transform, 
        vocab_func(vocab), 
        totensor(dtype=torch.long), 
    )
    label_transform = sequential_transforms(totensor(dtype=torch.long))
    data_test = list(zip(df_test[0], df_test[5]))
    test_dataset = TextClassificationDataset(
        data_test, 
        vocab, 
        (label_transform, text_transform),
    )
    # pandas is easy to split
    #data_train = list(zip(df_train[0], df_train[5]))
    #train_dataset = TextClassificationDataset(data_train, vocab, (label_transform, text_transform))
    dataset_split = []
    for username in usernames:
        split_train = df_small.loc[df_small[4] == username]
        split_train = list(zip(split_train[0], split_train[5]))
        dataset_split.append(
            {
                'train': TextClassificationDataset(
                    split_train, 
                    vocab, 
                    (label_transform, text_transform),
                ),
                'test': None, 
            }
        )
    for item in dataset_split: item['vocab'] = vocab
    testset_dict = {
        'train': None,
        'test': test_dataset,
        'vocab': vocab,
    }
    return dataset_split, testset_dict
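For reference, a hypothetical params dict with the keys niid_device reads (the values are placeholders; the global df and the helpers used above are assumed to be defined elsewhere in the module).

params = {
    'Trainer': {'n_clients': 10},  # number of federated clients
    'Dataset': {'user': 10},       # must equal n_clients (asserted above)
}
# dataset_split, testset_dict = niid_device(params)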
Example #4
def process_raw_data(raw_data, tokenizer, vocab):
    raw_data = [(label, text) for (label, text) in raw_data]
    text_transform = sequential_transforms(tokenizer.tokenize,
                                           vocab_func(vocab),
                                           totensor(dtype=torch.long))
    label_transform = sequential_transforms(totensor(dtype=torch.long))
    transforms = (label_transform, text_transform)
    dataset = TextClassificationDataset(raw_data, vocab, transforms)
    return dataset
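A minimal usage sketch for process_raw_data; the tokenizer wrapper and toy vocab below are illustrative stand-ins (the helper calls tokenizer.tokenize, so any object exposing a .tokenize(str) -> list[str] method fits).

# Illustrative stand-ins, not part of the original example.
class WhitespaceTokenizer:
    def tokenize(self, text):
        return text.lower().split()

toy_vocab = {'the': 0, 'film': 1, 'was': 2, 'great': 3, 'bad': 4}
toy_raw = [(1, 'the film was great'), (0, 'the film was bad')]
dataset = process_raw_data(toy_raw, WhitespaceTokenizer(), toy_vocab)
# dataset[0] is roughly (tensor(1), tensor([0, 1, 2, 3]))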
Example #5
def _setup_datasets(dataset_name, root, vocabs, data_select):
    data_select = check_default_set(data_select, ('train', 'valid', 'test'))
    raw_iter_tuple = raw.DATASETS[dataset_name](root=root,
                                                data_select=data_select)
    raw_data = {}
    for name, raw_iter in zip(data_select, raw_iter_tuple):
        raw_data[name] = list(raw_iter)

    if vocabs is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocabs = build_vocab(raw_data["train"])
    else:
        if not isinstance(vocabs, list):
            raise TypeError("vocabs must be an instance of list")

        # Find data that's not None
        notnone_data = None
        for key in raw_data.keys():
            if raw_data[key] is not None:
                notnone_data = raw_data[key]
                break
        if len(vocabs) != len(notnone_data[0]):
            raise ValueError(
                "Number of vocabs must match the number of columns "
                "in the data")

    transformers = [
        sequential_transforms(vocab_func(vocabs[idx]),
                              totensor(dtype=torch.long))
        for idx in range(len(vocabs))
    ]
    return tuple(
        SequenceTaggingDataset(raw_data[item], vocabs, transformers)
        for item in data_select)
Example #6
def _setup_datasets(dataset_name,
                    root='.data',
                    vocab=None,
                    tokenizer=None,
                    data_select=('train', 'dev')):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'dev'))):
        raise TypeError(
            'Given data selection {} is not supported!'.format(data_select))
    train, dev = raw.DATASETS[dataset_name](root=root)
    raw_data = {
        'train': [item for item in train],
        'dev': [item for item in dev]
    }
    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(
                    _question) + tok_ans

        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          len(raw_data['train']))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    return tuple(
        QuestionAnswerDataset(raw_data[item], vocab, transforms)
        for item in data_select)
Example #7
def _setup_datasets(dataset_name, root, vocab, tokenizer, split_):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    split = _check_default_set(split_, ('train', 'dev'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }
    if vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(
                    _question) + tok_ans

        logger_.info('Building Vocab based on train data')
        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          specials=['<unk>', '<pad>'])
        vocab.set_default_index(vocab['<unk>'])
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    logger_.info('Building datasets for {}'.format(split))
    return _wrap_datasets(
        tuple(
            QuestionAnswerDataset(raw_data[item], vocab, transforms)
            for item in split), split_)
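A hedged example of calling this helper; the 'SQuAD1' dataset key and the split tuple are assumptions for illustration.

# Illustrative call; the vocab is built from the train split since vocab is None.
train_qa, dev_qa = _setup_datasets('SQuAD1', root='.data', vocab=None,
                                   tokenizer=None, split_=('train', 'dev'))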
Example #8
def load_imdb(review, score, vocab):
    print('loading imdb text and score data')
    with open(review) as f:
        text = [tokenize(line, max_length) for line in f.readlines()]
    with open(score) as f:
        score = []
        for real_score in f.readlines():
            if int(real_score) >= 6:
                score.append(0)  # positive
            else:
                score.append(1)  # negative

    text_transform = sequential_transforms(
        vocab_func(vocab),
        totensor(torch.long)
    )
    label_transform = sequential_transforms(
        totensor(torch.long)
    )
    dataset = TextClassificationDataset(list(zip(score, text)), vocab, (label_transform, text_transform))
    return dataset
Example #9
def _setup_datasets(
        dataset_name,
        root=".data",
        ngrams=1,
        vocab=None,
        tokenizer=None,
        data_select=("train", "test"),
):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(("train", "test"))):
        raise TypeError(
            "Given data selection {} is not supported!".format(data_select))
    train, test = raw.DATASETS[dataset_name](root=root)
    # Cache raw text iterable dataset
    raw_data = {
        "train": [(label, txt) for (label, txt) in train],
        "test": [(label, txt) for (label, txt) in test],
    }

    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    return tuple(
        TextClassificationDataset(raw_data[item], vocab, (label_transform,
                                                          text_transform))
        for item in data_select)
Example #10
def build_legacy_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for line in f:
            for token in line:  # note: iterating a str yields individual characters
                yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    return pipeline, None, None
Example #11
def build_torchtext_vocab(vocab_file):
    from torchtext.data.utils import get_tokenizer
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from torchtext.experimental.functional import totensor, vocab_func, sequential_transforms

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for token in f:
            yield token
    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab), totensor(dtype=torch.long))
    return pipeline, None, None
Example #12
def _setup_datasets(dataset_name, root, vocab, tokenizer, data_select):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    data_select = check_default_set(data_select, ('train', 'dev'))
    raw_datasets = raw.DATASETS[dataset_name](root=root, data_select=data_select)
    raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(data_select, raw_datasets)}
    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(_question) + tok_ans
        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']), len(raw_data['train']))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab), totensor(dtype=torch.long))
    transforms = {'context': text_transform, 'question': text_transform,
                  'answers': text_transform, 'ans_pos': totensor(dtype=torch.long)}
    return tuple(QuestionAnswerDataset(raw_data[item], vocab, transforms) for item in data_select)
Example #13
def build_legacy_pytext_vocab_pipeline(vocab_file):
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
    f = open(vocab_file, 'r')

    vocab_counter = Counter([token for line in f for token in line.rstrip()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = sequential_transforms(tokenizer_func(tokenizer),
                                     PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")))
    return pipeline, None, None
Example #14
def _setup_datasets(dataset_name,
                    root=".data",
                    vocabs=None,
                    data_select=("train", "valid", "test")):
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(("train", "valid", "test"))):
        raise TypeError(
            "Given data selection {} is not supported!".format(data_select))

    train, val, test = raw.DATASETS[dataset_name](root=root)
    raw_data = {
        "train": [line for line in train] if train else None,
        "valid": [line for line in val] if val else None,
        "test": [line for line in test] if test else None
    }

    if vocabs is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocabs = build_vocab(raw_data["train"])
    else:
        if not isinstance(vocabs, list):
            raise TypeError("vocabs must be an instance of list")

        # Find data that's not None
        notnone_data = None
        for key in raw_data.keys():
            if raw_data[key] is not None:
                notnone_data = raw_data[key]
                break
        if len(vocabs) != len(notnone_data[0]):
            raise ValueError(
                "Number of vocabs must match the number of columns "
                "in the data")

    transformers = [
        sequential_transforms(vocab_func(vocabs[idx]),
                              totensor(dtype=torch.long))
        for idx in range(len(vocabs))
    ]

    datasets = []
    for item in data_select:
        if raw_data[item] is not None:
            datasets.append(
                SequenceTaggingDataset(raw_data[item], vocabs, transformers))

    return datasets
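A hedged call of the helper above for a sequence-tagging corpus; the 'UDPOS' key and the per-sample description are assumptions for illustration.

# Illustrative call; per-column vocabs are built from the train split.
train_ds, valid_ds, test_ds = _setup_datasets('UDPOS', root='.data')
# each sample holds one LongTensor per annotation column (e.g. words, tags)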
Example #15
def load_dataset(directory, dev_ratio=None, using_vocab=None):  # user-defined function; directory: where to read from; dev_ratio: ?
    print(f'loading files in {directory}')
    text = []
    labels = []
    classes = os.listdir(directory)  # [neg, pos]: picks up the subdirectories inside the given directory
    for directory_name in classes:
        for filename in tqdm.tqdm(os.listdir(f'{directory}/{directory_name}'), desc=f'loading {directory_name}'):
            with open(f'{directory}/{directory_name}/{filename}', encoding='utf-8') as f:
                tokens = tokenize(f.read(), max_length)
                text.append(tokens)
                labels.append(directory_name)

    if dev_ratio is not None:
        text, dev_text, labels, dev_labels = train_test_split(text, labels, test_size=dev_ratio)

    if using_vocab is None:
        using_vocab = make_vocab(text, vocab_size)

    text_transform = sequential_transforms(
        vocab_func(using_vocab),
        totensor(torch.long)
    )
    label_map = {name: index for index, name in enumerate(classes)}
    print(label_map)
    label_transform = sequential_transforms(
        lambda label: label_map[label],
        totensor(torch.long)
    )

    dataset = TextClassificationDataset(list(zip(labels, text)), using_vocab, (label_transform, text_transform))

    if dev_ratio is not None:
        dev_dataset = TextClassificationDataset(list(zip(dev_labels, dev_text)), using_vocab, (label_transform, text_transform))
        return dataset, dev_dataset
    else:
        return dataset
Example #16
def build_legacy_batch_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from transforms import TextClassificationPipeline

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for line in f:
            for token in line:
                yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    text_pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    label_pipeline = totensor(dtype=torch.long)
    return TextClassificationPipeline(label_pipeline, text_pipeline), None, None
Example #17
def niid(params):
    num_user = params['Trainer']['n_clients']
    dataset_frac = params['Dataset']['frac']
    s = params['Dataset']['s']
    df_small = df.sample(frac=dataset_frac) # sample & shuffle
    df_train = df_small.iloc[:int(df_small.shape[0] * 0.9), :]
    df_test = df_small.iloc[int(df_small.shape[0] * 0.9):, :]
    text_transform = sequential_transforms(
        str.lower, 
        get_tokenizer("basic_english"),
    )
    counter = Counter(dict(
        get_vocab_counter(df_train[5], text_transform).most_common(3000 - 2)
    ))
    vocab = Vocab(
        counter, 
        vectors='glove.6B.300d', 
        vectors_cache='./data/vector_cache/',
    )
    text_transform = sequential_transforms(
        text_transform, 
        vocab_func(vocab), 
        totensor(dtype=torch.long), 
    )
    label_transform = sequential_transforms(totensor(dtype=torch.long))
    data_test = list(zip(df_test[0], df_test[5]))
    test_dataset = TextClassificationDataset(
        data_test, 
        vocab, 
        (label_transform, text_transform),
    )
    # pandas is easy to split
    #data_train = list(zip(df_train[0], df_train[5]))
    #train_dataset = TextClassificationDataset(data_train, vocab, (label_transform, text_transform))
    df_train_iid = df_train.iloc[:int(s * df_train.shape[0]), :]
    df_train_niid = df_train.iloc[int(s * df_train.shape[0]):, :].sort_values([0])
    p_train_iid = 0
    p_train_niid = 0
    delta_train_iid = df_train_iid.shape[0] // num_user
    delta_train_niid = df_train_niid.shape[0] // num_user
    dataset_split = []
    for userid in range(num_user):
        train_lst = []
        if delta_train_iid > 0:
            train_lst.append(
                df_train_iid[
                    p_train_iid: p_train_iid + delta_train_iid
                ]
            )
        if delta_train_niid > 0:
            train_lst.append(
                df_train_niid[
                    p_train_niid: p_train_niid + delta_train_niid
                ]
            )
        split_train = pd.concat(train_lst)
        split_train = list(zip(split_train[0], split_train[5]))
        dataset_split.append(
            {
                'train': TextClassificationDataset(
                    split_train, 
                    vocab, 
                    (label_transform, text_transform),
                ),
                'test': None, 
            }
        )
        p_train_iid += delta_train_iid
        p_train_niid += delta_train_niid
    for item in dataset_split: item['vocab'] = vocab
    testset_dict = {
        'train': None,
        'test': test_dataset,
        'vocab': vocab,
    }
    return dataset_split, testset_dict
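As with the device-based split above, a hypothetical params dict with the keys niid reads (values are placeholders).

params = {
    'Trainer': {'n_clients': 10},        # number of clients
    'Dataset': {'frac': 1.0, 's': 0.2},  # sample fraction; s = share of train data split i.i.d.
}
# dataset_split, testset_dict = niid(params)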
Example #18
def build_legacy_fasttext_vector_pipeline():
    tokenizer = get_tokenizer("basic_english")
    vector = FastText()

    pipeline = sequential_transforms(tokenizer, vector.get_vecs_by_tokens)
    return pipeline, None, None
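An illustrative call of the vector pipeline above; the input sentence is a placeholder, and FastText() downloads pretrained English vectors on first use.

pipeline, _, _ = build_legacy_fasttext_vector_pipeline()
vecs = pipeline('the film was great')  # str -> tokens -> (num_tokens, 300) float tensor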
Example #19
import torch
from torchtext.experimental.vocab import build_vocab_from_iterator
from torchtext.experimental.datasets.text_classification import TextClassificationDataset
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor

# load data from whatever format it's saved in to an iterable of (label, text)
my_data = [('pos', 'this film is great'), ('neg', 'this film is bad'),
           ('neg', 'this film is awful')]

# tokenizer can be any callable function that goes from str -> list[str]
my_tokenizer = get_tokenizer('basic_english')

# build vocabulary from data
my_vocab = build_vocab_from_iterator(
    [my_tokenizer(text) for label, text in my_data])

# how should the label be transformed?
# str -> int -> LongTensor
label_transforms = sequential_transforms(lambda x: 1 if x == 'pos' else 0,
                                         totensor(torch.long))

# how should the text be transformed?
# str -> list[str] -> list[int] -> LongTensor
text_transforms = sequential_transforms(my_tokenizer, vocab_func(my_vocab),
                                        totensor(torch.long))

# tuple the transforms
my_transforms = (label_transforms, text_transforms)

# create TextClassificationDataset with data, vocabulary and transforms
dataset = TextClassificationDataset(my_data, my_vocab, my_transforms)
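Building on the dataset above, a minimal batching sketch; the collate function and batch size are illustrative choices, not part of the example.

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_batch(batch):
    # each sample is (label LongTensor, token id LongTensor)
    labels = torch.stack([label for label, text in batch])
    # pad the variable-length token id tensors to the longest one in the batch
    texts = pad_sequence([text for label, text in batch], batch_first=True)
    return labels, texts

loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)
# for labels, texts in loader:
#     ...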
Example #20
def _setup_datasets(dataset_name,
                    tokenizer=None,
                    root='.data',
                    vocab=None,
                    data_select=('train', 'test', 'valid'),
                    single_line=True):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'valid', 'test'))):
        raise TypeError(
            'Given data selection {} is not supported!'.format(data_select))

    if not single_line and dataset_name != 'WikiText103':
        raise TypeError('single_line must be True except for WikiText103')
    if dataset_name == 'WMTNewsCrawl':
        train, = raw.DATASETS[dataset_name](root=root, data_select=('train', ))
        if single_line:
            raw_data = {
                'train': [
                    " ".join([txt for txt in train]),
                ]
            }
        else:
            raw_data = {'train': [txt for txt in train]}
    else:
        train, test, valid = raw.DATASETS[dataset_name](root=root,
                                                        data_select=('train',
                                                                     'test',
                                                                     'valid'))
        # Cache raw text iterable dataset
        if single_line:
            raw_data = {
                'train': [
                    " ".join([txt for txt in train]),
                ],
                'valid': [
                    " ".join(txt for txt in valid),
                ],
                'test': [
                    " ".join(txt for txt in test),
                ]
            }
        else:
            raw_data = {
                'train': [txt for txt in train],
                'valid': [txt for txt in valid],
                'test': [txt for txt in test]
            }

    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data['train'], text_transform)
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    return tuple(
        LanguageModelingDataset(raw_data[item], vocab, text_transform,
                                single_line) for item in data_select)