Code example #1
File: test_field.py Project: isabella232/text-3
    def test_init(self):
        # basic init
        label_field = data.LabelField()
        assert label_field.sequential is False
        assert label_field.unk_token is None

        # init with preset fields: LabelField deliberately overrides
        # sequential and unk_token, so the defaults still hold
        label_field = data.LabelField(sequential=True, unk_token="<unk>")
        assert label_field.sequential is False
        assert label_field.unk_token is None
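The asserts above hold because the legacy LabelField constructor overwrites those keyword arguments before delegating to Field. A simplified sketch of that behaviour (not the verbatim library source):

from torchtext.legacy import data

class LabelField(data.Field):
    """Sketch: labels are never sequences and never need an <unk> token."""
    def __init__(self, **kwargs):
        # user-supplied values for these kwargs are intentionally discarded
        kwargs['sequential'] = False
        kwargs['unk_token'] = None
        kwargs['is_target'] = True
        super().__init__(**kwargs)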
Code example #2
File: load_data.py Project: sharan21/vae-exps-3
def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied
    Field : A class that stores information about the way of preprocessing
    fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which
                 will pad each sequence to have a fix length of 200.
                 
    build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an
                  idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding.
                  
    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
    
    """

    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=70)
    LABEL = data.LabelField()
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Further split training data into new training and validation sets
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    # Alternatively, we can use the default configurations:
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)
    # print(TEXT.vocab.stoi["<eos>"])
    # print(TEXT.vocab.stoi["<sos>"])
    # exit(0)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
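As a usage note (not part of the original project), the returned iterators can be consumed as below; because TEXT was built with include_lengths=True, batch.text is a (padded ids, lengths) pair:

TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()

for batch in train_iter:
    text, text_lengths = batch.text   # include_lengths=True -> (token id tensor, lengths tensor)
    labels = batch.label              # LabelField yields a 1-D tensor of class indices
    # a model forward pass, e.g. model(text, text_lengths), would go here
    break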
Code example #3
File: test_field.py Project: isabella232/text-3
    def test_vocab_size(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.LabelField()

        # Copied from test_build_vocab with minor changes
        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        # Skipping json dataset as we can rely on the original build vocab test
        label_field.build_vocab(tsv_dataset)
        assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
        expected_stoi = {'1': 0, '0': 1}  # No <unk>
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert label_field.vocab.itos == expected_itos
Code example #4
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length - 2]
    return tokens


MYTEXT = data.Field(batch_first=True,
                    use_vocab=False,
                    tokenize=tokenize_and_cut,
                    preprocessing=tokenizer.convert_tokens_to_ids,
                    init_token=init_token_idx,
                    eos_token=eos_token_idx,
                    pad_token=pad_token_idx,
                    unk_token=unk_token_idx)
MYSENTIMENT = data.LabelField(dtype=torch.float)

#INFERENCE


def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length - 2]
    indexed = ([init_token_idx] + tokenizer.convert_tokens_to_ids(tokens)
               + [eos_token_idx])
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()
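The snippet above assumes a subword tokenizer and the special-token indices already exist. A plausible setup, assuming a Hugging Face BERT tokenizer (the names below are assumptions, not part of the original):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

# indices of the special tokens used by MYTEXT above
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id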
Code example #5
fpLabelP3Train = fopOutputML + 'train.label.p3.txt'

# fpValid = fopRoot + 'valid.csv'
# fpTextValid = fopRoot + 'testP.text.txt'
fpLabelP1Valid = fopOutputML + 'testP.label.p1.txt'
fpLabelP2Valid = fopOutputML + 'testP.label.p2.txt'
fpLabelP3Valid = fopOutputML + 'testP.label.p3.txt'

# fpTest = fopRoot + 'test.csv'
# fpTextTest = fopRoot + 'testW.text.txt'
fpLabelP1Test = fopOutputML + 'testW.label.p1.txt'
fpLabelP2Test = fopOutputML + 'testW.label.p2.txt'
fpLabelP3Test = fopOutputML + 'testW.label.p3.txt'
sys.stdout = open(fpResultDetails, 'w')
TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.long, batch_first=True, use_vocab=True)
fields = [('label', LABEL), ('text', TEXT)]
# loading custom dataset p1
train_data = data.TabularDataset(path=fpLabelP1Train,
                                 format='csv',
                                 fields=fields,
                                 skip_header=True)
valid_data = data.TabularDataset(path=fpLabelP1Valid,
                                 format='csv',
                                 fields=fields,
                                 skip_header=True)
test_data = data.TabularDataset(path=fpLabelP1Test,
                                format='csv',
                                fields=fields,
                                skip_header=True)
acc_p1 = trainAndEval(train_data, valid_data, test_data)
Code example #6
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import numpy as np

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  batch_first=True)

LABEL = data.LabelField(dtype=torch.float)
# define the data fields

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
# load the train/test splits using the fields defined above
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
# split off a validation set from the training data

MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.50d",
                 unk_init=torch.Tensor.normal_)

LABEL.build_vocab(train_data)
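A common next step (not shown in this example) is to copy the pretrained vectors into the model's embedding layer; a minimal sketch, assuming an nn.Embedding sized to the vocabulary built above:

import torch.nn as nn

EMBEDDING_DIM = 50  # matches glove.6B.50d above
embedding = nn.Embedding(len(TEXT.vocab), EMBEDDING_DIM)
embedding.weight.data.copy_(TEXT.vocab.vectors)

# zero out the <unk> and <pad> rows so they carry no pretrained signal
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)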
Code example #7
File: test_dataset.py Project: zkneupper/text
    def test_stratified_dataset_split(self):
        num_examples, num_labels = 30, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        text_field = data.Field()
        label_field = data.LabelField()
        fields = [('text', text_field), ('label', label_field)]

        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        # Default split ratio
        expected_train_size = 21
        expected_test_size = 9

        train, test = dataset.split(stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test array arguments with same ratio
        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test strata_field argument
        train, test = dataset.split(split_ratio=split_ratio,
                                    stratified=True,
                                    strata_field='label')
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test invalid field name
        strata_field = 'dummy'
        with pytest.raises(ValueError):
            dataset.split(split_ratio=split_ratio,
                          stratified=True,
                          strata_field=strata_field)

        # Test uneven stratify sizes
        num_examples, num_labels = 28, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        # 10 examples for class 1 and 9 examples for classes 2,3
        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        expected_train_size = 7 + 6 + 6
        expected_test_size = 3 + 3 + 3
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Add validation set
        split_ratio = [0.6, 0.3, 0.1]
        expected_train_size = 6 + 5 + 5
        expected_valid_size = 1 + 1 + 1
        expected_test_size = 3 + 3 + 3
        train, valid, test = dataset.split(split_ratio=split_ratio,
                                           stratified=True)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size
Code example #8
File: test_dataset.py Project: zkneupper/text
    def test_dataset_split_arguments(self):
        num_examples, num_labels = 30, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        text_field = data.Field()
        label_field = data.LabelField()
        fields = [('text', text_field), ('label', label_field)]

        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        # Test default split ratio (0.7)
        expected_train_size = 21
        expected_test_size = 9

        train, test = dataset.split()
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test array arguments with same ratio
        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Add validation set
        split_ratio = [0.6, 0.3, 0.1]
        expected_train_size = 18
        expected_valid_size = 3
        expected_test_size = 9

        train, valid, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size

        # Test ratio normalization
        split_ratio = [6, 3, 1]
        train, valid, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size

        # Test only two splits returned for too small valid split size
        split_ratio = [0.66, 0.33, 0.01]
        expected_length = 2
        splits = dataset.split(split_ratio=split_ratio)
        assert len(splits) == expected_length

        # Test invalid arguments
        split_ratio = 1.1
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = -1.
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = [0.7]
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = [1, 2, 3, 4]
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = "string"
        with pytest.raises(ValueError):
            dataset.split(split_ratio=split_ratio)
Code example #9
def main(config, model_filename):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    if not os.path.exists(config.cache_dir):
        os.makedirs(config.cache_dir)

    model_file = os.path.join(config.output_dir, model_filename)

    # Prepare the device
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    # Set Random Seeds
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True

    # Prepare the data
    id_field = data.RawField()
    id_field.is_target = False
    text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True)
    label_field = data.LabelField(dtype=torch.long)

    train_iterator, dev_iterator, test_iterator = load_data(
        config.data_path, id_field, text_field, label_field,
        config.train_batch_size, config.dev_batch_size, config.test_batch_size,
        device, config.glove_word_file, config.cache_dir)

    # Word Vector
    word_emb = text_field.vocab.vectors

    if config.model_name == "GAReader":
        from Baselines.GAReader.GAReader import GAReader
        model = GAReader(config.glove_word_dim, config.output_dim,
                         config.hidden_size, config.rnn_num_layers,
                         config.ga_layers, config.bidirectional,
                         config.dropout, word_emb)
        print(model)

    # optimizer = optim.Adam(model.parameters(), lr=config.lr)
    optimizer = optim.SGD(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_iterator, dev_iterator, optimizer,
              criterion, ['0', '1', '2', '3', '4'], model_file, config.log_dir,
              config.print_step, config.clip)

    model.load_state_dict(torch.load(model_file))

    test_loss, test_acc, test_report = evaluate(model, test_iterator,
                                                criterion,
                                                ['0', '1', '2', '3', '4'])
    print("-------------- Test -------------")
    print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}".
          format(test_loss, test_acc, test_report['macro avg']['f1-score'],
                 test_report['weighted avg']['f1-score']))
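The load_data helper used above is not shown in this example. A plausible sketch consistent with the call signature (the file names, CSV layout, and vector handling are assumptions):

from torchtext.legacy import data
from torchtext.vocab import Vectors


def load_data(data_path, id_field, text_field, label_field,
              train_batch_size, dev_batch_size, test_batch_size,
              device, glove_word_file, cache_dir):
    fields = [('id', id_field), ('text', text_field), ('label', label_field)]
    train, dev, test = data.TabularDataset.splits(
        path=data_path, train='train.csv', validation='dev.csv',
        test='test.csv', format='csv', fields=fields, skip_header=True)

    # load pretrained vectors from a local GloVe file, caching them in cache_dir
    vectors = Vectors(name=glove_word_file, cache=cache_dir)
    text_field.build_vocab(train, dev, vectors=vectors)
    label_field.build_vocab(train)

    return data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(train_batch_size, dev_batch_size, test_batch_size),
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        device=device)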
Code example #10
# pip install spacy
# python -m spacy download en_core_web_sm

from torchtext.legacy import data
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix

device = "cpu"

# dataset
LABEL = data.LabelField()
POST = data.Field(tokenize="spacy",
                  lower=True,
                  tokenizer_language="en_core_web_sm")
fields = [("body", POST), ("label", LABEL)]
dataset = data.TabularDataset(path="pytorch_data.csv",
                              format="CSV",
                              fields=fields)
train, test = dataset.split(split_ratio=[0.8, 0.2])

# vocabulary
POST.build_vocab(train, max_size=10000)  # , vectors = 'glove.6B.200d')
LABEL.build_vocab(train)  # fixes: 'LabelField' object has no attribute 'vocab'

# data loaders
train_iterator, test_iterator = data.BucketIterator.splits(
    (train, test),
    batch_size=32,
    device=device,