Example #1
def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    #train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    
    train, val, test = TranslationDataset.splits(
        path='.data/multi30k',
        exts=['.de', '.en'],
        fields=[('src', DE), ('trg', EN)],
        train='train',
        validation='val',
        test='test2016')
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)
    train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
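
For context, a minimal usage sketch of the function above (not part of the original snippet; it assumes the Multi30k files exist under .data/multi30k and the spaCy models are installed):

# Hedged usage sketch for load_dataset above; the batch size is illustrative.
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
for batch in train_iter:
    src, src_lengths = batch.src   # include_lengths=True yields (token tensor, lengths)
    trg, trg_lengths = batch.trg
    print(src.shape, trg.shape)
    break
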
def get_data(args):
    # batch
    batch_size = args.batch
    device = "cuda" if (torch.cuda.is_available() and args.use_cuda) else "cpu"

    # set up fields
    src = Field(
        sequential=True,
        tokenize=str.split,
        use_vocab=True,
        lower=True,
        include_lengths=False,
        fix_length=args.max_length,  # fix max length
        batch_first=True)
    trg = Field(
        sequential=True,
        tokenize=str.split,
        use_vocab=True,
        init_token='<s>',
        eos_token='</s>',
        lower=True,
        fix_length=args.max_length,  # fix max length
        batch_first=True)

    print('set up fields ... done')

    if args.data_type == "koen":

        train, valid, test = TranslationDataset.splits(('.ko', '.en'),
                                                       (src, trg),
                                                       train='train',
                                                       validation='valid',
                                                       test='test',
                                                       path=args.root_dir)

        # build the vocabulary
        src.build_vocab(train.src, min_freq=args.min_freq)
        trg.build_vocab(train.trg, min_freq=args.min_freq)

        # save the vocabulary
        src_vocabs = src.vocab.stoi
        trg_vocabs = trg.vocab.stoi

        with open('./src_vocabs.pkl', 'wb') as f:
            pickle.dump(src_vocabs, f, pickle.HIGHEST_PROTOCOL)
        with open('./trg_vocabs.pkl', 'wb') as f:
            pickle.dump(trg_vocabs, f, pickle.HIGHEST_PROTOCOL)

    else:
        assert False, "Please provide a valid data_type"

    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train, valid, test), batch_sizes=([batch_size] * 3), device=device)

    return (src, trg), (train, valid, test), (train_iter, valid_iter,
                                              test_iter)
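
A hedged sketch of how get_data might be called; the use of argparse.Namespace and every value below are assumptions chosen only to match how args is read inside the function:

import argparse

# Hypothetical arguments; attribute names mirror those accessed in get_data.
args = argparse.Namespace(batch=64, use_cuda=True, max_length=50,
                          data_type="koen", min_freq=2, root_dir="./data/koen")
(src, trg), datasets, iterators = get_data(args)
train_iter, valid_iter, test_iter = iterators
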
    def __init__(self, module_name, train_bs, eval_bs, device, log):
        self.module_name = module_name

        # split_chars = lambda x: list("".join(x.split()))
        split_chars = lambda x: list(x)  # keeps whitespaces

        source = Field(tokenize=split_chars,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        target = Field(tokenize=split_chars,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        log("Loading FULL datasets ...")
        folder = os.path.join(DATASET_TARGET_DIR, module_name)
        train_dataset, eval_dataset, _ = TranslationDataset.splits(
            path=folder,
            root=folder,
            exts=(INPUTS_FILE_ENDING, TARGETS_FILE_ENDING),
            fields=(source, target),
            train=TRAIN_FILE_NAME,
            validation=EVAL_FILE_NAME,
            test=EVAL_FILE_NAME)

        log("Building vocab ...")
        source.build_vocab(train_dataset)
        target.vocab = source.vocab

        log("Creating iterators ...")
        train_iterator = Iterator(dataset=train_dataset,
                                  batch_size=train_bs,
                                  train=True,
                                  repeat=True,
                                  shuffle=True,
                                  device=device)

        eval_iterator = Iterator(dataset=eval_dataset,
                                 batch_size=eval_bs,
                                 train=False,
                                 repeat=False,
                                 shuffle=False,
                                 device=device)

        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.train_iterator = train_iterator
        self.eval_iterator = eval_iterator
        self.source = source
        self.target = target
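
Because the training iterator above is created with repeat=True, it cycles indefinitely; a hedged sketch of a bounded training loop, assuming a hypothetical instance named loader and a step budget max_steps:

# Hypothetical usage; 'loader' is an instance of the class defined above.
max_steps = 1000
for step, batch in enumerate(loader.train_iterator):
    src, trg = batch.src, batch.trg   # batch_first=True -> shape (batch, seq_len)
    if step + 1 >= max_steps:
        break
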
def load_dataset(batch_size, device):
    """
    Load the dataset from the files into iterators and initialize the vocabulary.
    :param batch_size: batch size for the iterators
    :param device: device on which to place the batches
    :return: the source Field and the (train, valid, test) iterators
    """
    source = Field(tokenize=tokenize_en,
                   init_token='<sos>',
                   eos_token='<eos>',
                   lower=True)

    train_data, valid_data, test_data = TranslationDataset.splits(
        path=DATA_FOLDER,
        exts=(POSITIVE_FILE_EXTENSION, NEGATIVE_FILE_EXTENSION),
        fields=(source, source))
    source.build_vocab(train_data, min_freq=5)
    return source, BucketIterator.splits((train_data, valid_data, test_data),
                                         shuffle=True,
                                         batch_size=batch_size,
                                         device=device)
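
Since the function returns the Field plus the tuple produced by BucketIterator.splits, a caller might unpack it as follows (a sketch that assumes DATA_FOLDER, the file extensions and tokenize_en are defined elsewhere; batch size and device are illustrative):

source, (train_iter, valid_iter, test_iter) = load_dataset(batch_size=64, device='cpu')
print(len(source.vocab))  # shared vocabulary, since both sides use the same Field
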
Example #5
def get_data(path='data/'):
    SRC = Field(tokenize=tokenize_cn,
                init_token='<sos>',
                eos_token='<eos>',
                pad_token='<pad>',
                unk_token='<unk>',
                lower=True)
    TRG = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                pad_token='<pad>',
                unk_token='<unk>',
                lower=True)

    train_data, valid_data, test_data = TranslationDataset.splits(
        path=path,
        train='train',
        validation='val',
        test='test',
        exts=('.cn', '.en'),
        fields=(SRC, TRG))

    print("train: {}".format(len(train_data.examples)))
    print("valid: {}".format(len(valid_data.examples)))
    print("test: {}".format(len(test_data.examples)))

    SRC.build_vocab(train_data, min_freq=params.MIN_FREQ)
    TRG.build_vocab(train_data, min_freq=params.MIN_FREQ)

    print("源语言词表大小: {}".format(len(SRC.vocab)))
    print("目标语言词表大小: {}".format(len(TRG.vocab)))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=params.BATCH_SIZE,
        device=device)

    return train_iterator, valid_iterator, test_iterator, SRC, TRG
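
As a brief illustration of what the returned fields expose (a sketch; it assumes params, tokenize_cn and tokenize_en are available as in the snippet above):

# Hypothetical usage of the returned Field objects.
train_iterator, valid_iterator, test_iterator, SRC, TRG = get_data(path='data/')
print(SRC.vocab.stoi['<sos>'])   # token -> index
print(TRG.vocab.itos[:10])       # first few index -> token entries
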
Example #6
def load_dataset(args):
    def tokenize_zhcha(text):
        #return [tok for tok in re.sub('\s','',text).strip()]
        return [tok for tok in text.strip()]

    def tokenize_zhword(text):
        return [tok for tok in text.strip().split()]

    def tokenize_ticha(text):
        return [tok for tok in text.strip().split()]

    def tokenize_tiword(text):
        return [tok for tok in text.strip().split()]

    ZH_CHA = Field(tokenize=tokenize_zhcha,
                   include_lengths=True,
                   init_token='<sos>',
                   eos_token='<eos>')

    ZH_WORD = Field(tokenize=tokenize_zhword,
                    include_lengths=True,
                    init_token='<sos>',
                    eos_token='<eos>')

    Ti_CHA = Field(tokenize=tokenize_ticha,
                   include_lengths=True,
                   init_token='<sos>',
                   eos_token='<eos>')

    Ti_WORD = Field(tokenize=tokenize_tiword,
                    include_lengths=True,
                    init_token='<sos>',
                    eos_token='<eos>')

    #pdb.set_trace()

    # Load the data according to the training mode
    if args.mode == 'ctc':
        exts = (args.extension.split()[0], args.extension.split()[1])
        train, val, test = Trans.splits(path=args.path,
                                        exts=exts,
                                        fields=(Ti_CHA, Ti_WORD),
                                        train=args.train,
                                        validation=args.valid,
                                        test=args.test)

        Ti_CHA.build_vocab(train.src)
        Ti_WORD.build_vocab(train.trg)

        train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=args.batch_size, repeat=False)
        return train_iter, val_iter, test_iter, Ti_CHA, Ti_WORD

    elif args.mode == 'nmt':
        exts = (args.extension.split()[0], args.extension.split()[1])
        train, val, test = Trans.splits(path=args.path,
                                        exts=exts,
                                        fields=(Ti_WORD, ZH_WORD),
                                        train=args.train,
                                        validation=args.valid,
                                        test=args.test)

        Ti_WORD.build_vocab(train.src, max_size=50000)
        ZH_WORD.build_vocab(train.trg, max_size=50000)

        train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=args.batch_size, repeat=False)
        return train_iter, val_iter, test_iter, Ti_WORD, ZH_WORD

    elif args.mode == 'nmt_char':
        exts = (args.extension.split()[0], args.extension.split()[1])
        train, val, test = Trans.splits(path=args.path,
                                        exts=exts,
                                        fields=(Ti_CHA, ZH_CHA),
                                        train=args.train,
                                        validation=args.valid,
                                        test=args.test)

        Ti_CHA.build_vocab(train.src)
        ZH_CHA.build_vocab(train.trg)

        train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=args.batch_size, repeat=False)
        return train_iter, val_iter, test_iter, Ti_CHA, ZH_CHA

    elif args.mode == 'combine':
        exts = (args.extension.split()[0], args.extension.split()[1])
        train, val, test = Trans.splits(path=args.path,
                                        exts=exts,
                                        fields=(Ti_CHA, ZH_WORD),
                                        train=args.train,
                                        validation=args.valid,
                                        test=args.test)

        Ti_CHA.build_vocab(train.src)
        ZH_WORD.build_vocab(train.trg, max_size=50000)

        train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=args.batch_size, repeat=False)
        return train_iter, val_iter, test_iter, Ti_CHA, ZH_WORD

    elif args.mode == 'refine_ctc':
        exts = (args.extension.split()[0], args.extension.split()[1])
        train, val, test = Trans.splits(path=args.path,
                                        exts=exts,
                                        fields=(Ti_CHA, Ti_WORD),
                                        train=args.train,
                                        validation=args.valid,
                                        test=args.test)

        Ti_CHA.build_vocab(train.src)
        Ti_WORD.build_vocab(train.trg, max_size=50000)

        train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=args.batch_size, repeat=False)
        return train_iter, val_iter, test_iter, Ti_CHA, Ti_WORD

    elif args.mode == 'update_twoLoss':
        exts = (args.extension.split()[0], args.extension.split()[1],
                args.extension.split()[2])
        train, val, test, = mydataset.splits(path=args.path,
                                             exts=exts,
                                             fields=(Ti_CHA, ZH_WORD, Ti_WORD),
                                             train=args.train,
                                             validation=args.valid,
                                             test=args.test)
        Ti_CHA.build_vocab(train.src)
        ZH_WORD.build_vocab(train.trg, max_size=50000)
        Ti_WORD.build_vocab(train.ctc, max_size=50000)

        train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=args.batch_size, repeat=False)

        return train_iter, val_iter, test_iter, Ti_CHA, ZH_WORD, Ti_WORD
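
A hedged example of the args this loader expects for one mode; every value below is a hypothetical placeholder chosen only to match how args is read inside the function (the extension string is split on whitespace into the two file extensions):

import argparse

# Hypothetical arguments for the 'nmt' branch.
args = argparse.Namespace(mode='nmt',
                          extension='.ti .zh',
                          path='./data',
                          train='train', valid='val', test='test',
                          batch_size=32)
train_iter, val_iter, test_iter, Ti_WORD, ZH_WORD = load_dataset(args)
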
Example #7
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

SRC = Field(tokenize=None, init_token='<sos>', eos_token='<eos>', lower=True)

TRG = Field(tokenize=None, init_token='<sos>', eos_token='<eos>', lower=True)

myData = TranslationDataset('./E_V/train', ('.en', '.vi'), (SRC, TRG))

train_data, test_data = myData.splits(exts=('.en', '.vi'),
                                      fields=(SRC, TRG),
                                      path="./E_V/",
                                      train='train',
                                      validation=None,
                                      test='tst2012')
vocabData = TranslationDataset('./E_V/vocab', ('.en', '.vi'), (SRC, TRG))
print(f"Number of training examples: {len(train_data.examples)}")
# # print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

SRC.build_vocab(train_data, min_freq=3)
TRG.build_vocab(train_data, min_freq=3)

print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
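
The snippet stops after selecting a device; a plausible continuation (not in the original) would build bucketed iterators, mirroring the other examples on this page:

# Hedged continuation with a hypothetical batch size.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=128,
    sort_key=lambda x: len(x.src),
    device=device)
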
Example #8
    def __init__(self,
                 data_dir: str,
                 packed: bool,
                 vocab_max_sizes: Tuple[int, int],
                 vocab_min_freqs: Tuple[int, int],
                 batch_sizes: Tuple[int, int, int],
                 test: bool = False):
        print(f"Creating DataLoader for {'testing' if test else 'training'}")

        # Rebuild the vocabs during testing, as the saved ones may have been built from a different config
        if test:
            vocab_exists = False
        else:
            vocab_exists = has_vocabs(data_dir, vocab_max_sizes,
                                      vocab_min_freqs)

        # Define torch text fields for processing text
        if vocab_exists:
            print("Loading fields and vocabs...")
            SRC, TRG = load_vocabs(data_dir, vocab_max_sizes, vocab_min_freqs)
        else:
            print("Building fields...")

            # Include the sentence length for source
            SRC = Field(tokenize=tokenize_diff,
                        init_token='<sos>',
                        eos_token='<eos>',
                        include_lengths=packed,
                        lower=True)

            TRG = Field(tokenize=tokenize_msg,
                        init_token='<sos>',
                        eos_token='<eos>',
                        lower=True)

        print("Loading commit data...")
        train_data, valid_data, test_data = TranslationDataset.splits(
            exts=('.diff', '.msg'),
            train='TrainingSet/train.26208',
            validation='TrainingSet/valid.3000',
            test='TestSet/test.3000',
            fields=(SRC, TRG),
            path=data_dir)

        if not vocab_exists:
            # Build vocabs
            print("Building vocabulary...")
            specials = ['<unk>', '<pad>', '<sos>', '<eos>']
            SRC.build_vocab(train_data,
                            min_freq=vocab_min_freqs[0],
                            max_size=vocab_max_sizes[0],
                            specials=specials)
            TRG.build_vocab(train_data,
                            min_freq=vocab_min_freqs[1],
                            max_size=vocab_max_sizes[1],
                            specials=specials)

            if not test:
                save_vocabs(data_dir, SRC, TRG, vocab_max_sizes,
                            vocab_min_freqs)

        print(f"Number of training examples: {len(train_data.examples)}")
        print(f"Number of validation examples: {len(valid_data.examples)}")
        print(f"Number of testing examples: {len(test_data.examples)}")
        print(
            f"Unique tokens in source (diff) training vocabulary: {len(SRC.vocab)}"
        )
        print(
            f"Unique tokens in target (msg) training vocabulary: {len(TRG.vocab)}"
        )

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Bucketing (minimizes the amount of padding by grouping similar length sentences)
        # Sort the sequences based on their non-padded length
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, valid_data, test_data),
            batch_sizes=batch_sizes,
            sort_within_batch=packed,
            sort_key=(lambda x: len(x.src)) if packed else None,  # sort by source length only when packing
            device=device)

        super().__init__(train_iterator, valid_iterator, test_iterator, SRC,
                         TRG, tokenize_diff, tokenize_msg)
Example #9
    def __init__(self,
                 module_name,
                 train_bs,
                 eval_bs,
                 device,
                 vocab=None,
                 base_folder=None,
                 train_name=None,
                 eval_name=None,
                 x_ext=None,
                 y_ext=None,
                 tokens=None,
                 specials=None,
                 tokenizer=None,
                 sort_within_batch=None,
                 shuffle=None):

        self.module_name = module_name

        # split_chars = lambda x: list("".join(x.split()))
        split_chars = lambda x: list(x)  # keeps whitespaces

        if not tokenizer:
            tokenizer = split_chars

        # NOTE: on Jul-20-2020, removed fix_length=200 since it forces
        # all batches to be of size (batch_size, 200) which
        # really wastes GPU memory
        source = Field(tokenize=tokenizer,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        target = Field(tokenize=tokenizer,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        base_folder = os.path.expanduser(base_folder)

        folder = os.path.join(base_folder, module_name)

        # fix slashes
        folder = os.path.abspath(folder)

        print("loading FULL datasets from folder={}".format(folder))

        train_dataset, eval_dataset, _ = TranslationDataset.splits(
            path=folder,
            root=folder,
            exts=(x_ext, y_ext),
            fields=(source, target),
            train=train_name,
            validation=eval_name,
            test=eval_name)

        if vocab:
            print("Setting vocab to prebuilt file...")
            source.vocab = vocab
            target.vocab = vocab
        elif tokens:
            print("Building vocab from tokens...")
            #source.build_vocab(tokens, specials)
            counter = Counter(tokens)
            source.vocab = source.vocab_cls(counter, specials=specials)
            target.vocab = source.vocab
        else:
            print("Building vocab from TRAIN and EVAL datasets...")
            source.build_vocab(train_dataset, eval_dataset)
            target.vocab = source.vocab

        print("Creating iterators ...")
        do_shuffle = True if shuffle is None else shuffle
        train_iterator = Iterator(dataset=train_dataset,
                                  batch_size=train_bs,
                                  train=True,
                                  repeat=True,
                                  shuffle=do_shuffle,
                                  sort_within_batch=sort_within_batch,
                                  device=device)

        eval_iterator = Iterator(dataset=eval_dataset,
                                 batch_size=eval_bs,
                                 train=False,
                                 repeat=False,
                                 shuffle=False,
                                 sort_within_batch=sort_within_batch,
                                 device=device)

        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset

        self.train_iterator = train_iterator
        self.eval_iterator = eval_iterator

        self.source = source
        self.target = target
Example #10
MAX_LEN = 100

from torchtext.datasets import TranslationDataset, Multi30k
ROOT = './'
Multi30k.download(ROOT)

SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en,
                 init_token=BOS_WORD,
                 eos_token=EOS_WORD,
                 pad_token=BLANK_WORD)

(trnset, valset,
 testset) = TranslationDataset.splits(path='./Multi30k/multi30k',
                                      exts=['.en', '.de'],
                                      fields=[('src', SRC), ('trg', TGT)],
                                      test='test2016')

#list(enumerate(testset))

import pandas as pd

df = pd.read_csv("./SQuAD_csv/train_SQuAD.csv", sep=';', header=None)

df = df.iloc[1:, :]
df = df.iloc[:, [1, 2]]

from sklearn.model_selection import train_test_split
train, val = train_test_split(df, test_size=0.1)
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)
Example #11

SRC = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

fields, exts = (SRC, TRG), ('.ig', '.en')
train_data, validate_data, test_data = TranslationDataset.splits(
    fields=fields,
    exts=exts,
    path=os.path.join('./..', 'data'),
    train='train',
    validation='val',
    test='test')

print(f"{'Training examples':>20s}: {len(train_data.examples)}")
print(f"{'Validation examples':>20s}: {len(validate_data.examples)}")
print(f"{'Testing examples':>20s}: {len(test_data.examples)}")

print(vars(train_data.examples[0]))

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
data_path_inp = 'enghin/train.en'
data_path_inp_val = 'enghin/dev.en'
data_path_tar = 'enghin/train.hi'
data_path_tar_val = 'enghin/dev.hi'
data_path_inp_test = 'enghin/test.en'
data_path_tar_test = 'enghin/test.hi'

torch.backends.cudnn.deterministic = True

def tokenize(text):
    return text.split()

src_field = Field(tokenize=tokenize, lower=True, init_token='<SOL>', eos_token='<EOL>')
trg_field = Field(tokenize=tokenize, lower=True, init_token='<SOL>', eos_token='<EOL>')

train_data, valid_data, test_data = TranslationDataset.splits(
    exts=(".en", ".hi"),
    fields=(src_field, trg_field),
    path="",
    train="train_med",
    validation="dev",
    test="test")

src_field.build_vocab(train_data, min_freq=2, max_size=10000)
trg_field.build_vocab(train_data, min_freq=2, max_size=10000)
# device must be defined before it is passed to BucketIterator.splits
device = 'cuda'

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

class Encoder(nn.Module):
    def __init__(self, inp_dim, embed_dim, encoder_hidden_dim, decoder_hidden_dim, dropout):
        super().__init__()
        
        self.inp_dim = inp_dim
        self.embed_dim = embed_dim
def load_dataset(
    dataset_name="SQUAD",
    tokenizer=word_tokenizer,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    use_glove=True,
    source_vocab=45000,
    target_vocab=28000,
    batch_size=VANILLA_SEQ2SEQ["BATCHSIZE"],
):
    """
    Load the dataset from disk and return the three iterators along with the SRC and TRG fields.
    """
    logger.debug("Loading {} dataset".format(dataset_name))
    # honor the `lower` argument instead of hard-coding True
    SRC = data.Field(
        tokenize=tokenizer,
        init_token=init_token,
        eos_token=eos_token,
        lower=lower,
        include_lengths=True,
    )
    TRG = data.Field(
        tokenize=tokenizer, init_token=init_token, eos_token=eos_token, lower=lower
    )

    location = os.path.join(FILE_PATH, dataset_name)

    logger.debug("Loading from location: {}".format(location))
    start_time = time.time()
    train_dataset, valid_dataset, test_dataset = TranslationDataset.splits(
        exts=(".paragraphs", ".questions"),
        fields=(SRC, TRG),
        path=location,
        train="train",
        validation="valid",
        test="test",
    )

    logger.debug(
        "Number of Samples: Training = {} | Validation = {} | Testing = {}".format(
            len(train_dataset.examples),
            len(valid_dataset.examples),
            len(test_dataset.examples),
        )
    )
    logger.debug("Time Taken: {:.6f}s".format(time.time() - start_time))
    logger.debug("Building Vocab")

    start_time = time.time()
    if use_glove:
        logger.debug("Using Glove vectors")
        SRC.build_vocab(train_dataset, max_size=source_vocab, vectors="glove.6B.300d")
        TRG.build_vocab(train_dataset, max_size=target_vocab, vectors="glove.6B.300d")
    else:
        SRC.build_vocab(train_dataset, max_size=source_vocab)
        TRG.build_vocab(train_dataset, max_size=target_vocab)

    logger.info(
        "Vocabulary Built! Source Tokens = {} | Target Tokens = {}  \nCreating Iterators".format(
            len(SRC.vocab), len(TRG.vocab)
        )
    )
    logger.debug("Time Taken: {:.6f}s".format(time.time() - start_time))

    return (
        BucketIterator.splits(
            (train_dataset, valid_dataset, test_dataset),
            batch_size=batch_size,
            sort_within_batch=True,
            sort_key=lambda x: len(x.src),
            device=device,
        ),
        SRC,
        TRG,
    )
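
Because the iterator triple is nested inside the returned tuple together with the fields, unpacking at the call site would look roughly like this (a sketch relying on the defaults defined above and on device being set elsewhere):

(train_iterator, valid_iterator, test_iterator), SRC, TRG = load_dataset()
for batch in train_iterator:
    src, src_lengths = batch.src   # include_lengths=True on the source field
    trg = batch.trg
    break
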
Example #14
    return text.split()


SRC = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

TRG = Field(tokenize=tokenize_hi,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

train_data, valid_data, test_data = TranslationDataset.splits(
    path='IITB_small',
    validation='dev',
    exts=('.en', '.hi'),
    fields=(SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

vars(train_data.examples[0])

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2, specials=['<pad>', '<sop>', '<eop>'])

print(f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (hi) vocabulary: {len(TRG.vocab)}")
Example #15
def dataset_construction_from_raw_dataset(
    src_language: str,
    trg_language: str,
    path: str,
    filenames_exts: Tuple[str, str],
    min_freq: int = 1,
    train_filename: str = 'train',
    valid_filename: str = 'val',
    test_filename: str = 'test',
    init_token: Optional[str] = '<sos>',
    eos_token: Optional[str] = '<eos>',
) -> Tuple[TranslationDataset, TranslationDataset, TranslationDataset, Field,
           Field]:
    """
    This function constructs the train, validation and test datasets starting from raw files. It also builds the
    vocabulary from the training dataset. Raw files should be text files where each line corresponds to a sentence
    in the respective language, and the extension should be language dependent. For example, if you have an English
    and German dataset, the train file should be called 'train.en' and 'train.de' respectively.
    :param src_language: the language of the source sequences, to be passed onto the Field tokenizer_language argument.
           Follows spacy's language abbreviations, i.e. 'en' for English, 'de' for German etc.
           See https://spacy.io/usage/models#languages for supported languages and their abbreviations.
    :param trg_language: the language of the target sequences, to be passed onto the Field tokenizer_language argument.
           Same conventions as for src_language (see above).
    :param path: the folder where the raw files are stored.
    :param filenames_exts: a tuple containing the extension to path for source and target language respectively.
           For German (source) and English (target), this would be filenames_exts = ('.de', '.en')
    :param min_freq: the minimum frequency a word must have, in the training corpus, in order to be included in
           the vocabulary. Default: 1.
    :param train_filename: the prefix of the train dataset (without extension). Default: 'train'.
    :param valid_filename: the prefix of the validation dataset (without extension). Default: 'val'.
    :param test_filename: the prefix of the test dataset (without extension). Default: 'test'.
    :param init_token: a token that will be prepended to every sentence, or None for no initial token. Default: '<sos>'.
    :param eos_token: a token that will be appended to every sentence, or None for no end-of-sentence token.
           Default: '<eos>'.
    :return: train: the training dataset, converted to a torchtext.datasets.TranslationDataset
             valid: the validation dataset, converted to a torchtext.datasets.TranslationDataset
             test: the test dataset, converted to a torchtext.datasets.TranslationDataset
             src_field: the Field object for the source dataset. Defines a datatype together with instructions for
             converting to Tensor. This might be needed if we want to convert new text to integers or vice versa using
             the vocabulary built with our input training corpus.
             trg_field: the Field object for the target dataset. See src_field for a description.
    """
    src_field = Field(
        sequential=True,
        use_vocab=True,
        init_token=init_token,
        eos_token=eos_token,
        tokenize='spacy',
        tokenizer_language=src_language,
        batch_first=True,
        is_target=False,
    )
    trg_field = Field(sequential=True,
                      use_vocab=True,
                      init_token=init_token,
                      eos_token=eos_token,
                      tokenize='spacy',
                      tokenizer_language=trg_language,
                      batch_first=True,
                      is_target=True)

    train, valid, test = TranslationDataset.splits(
        exts=filenames_exts,
        fields=(src_field, trg_field),
        path=path,
        train=train_filename,  # these will be suffixed with the extensions given in the exts tuple.
        validation=valid_filename,
        test=test_filename)

    src_field.build_vocab(train, min_freq=min_freq)
    trg_field.build_vocab(train, min_freq=min_freq)

    return train, valid, test, src_field, trg_field
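
Following the conventions spelled out in the docstring (German source, English target, files named train/val/test), a call might look like this; the data folder below is a hypothetical placeholder:

# Hedged usage sketch based on the docstring's German -> English example.
train, valid, test, src_field, trg_field = dataset_construction_from_raw_dataset(
    src_language='de',
    trg_language='en',
    path='./data/deen',           # hypothetical folder containing train/val/test .de/.en files
    filenames_exts=('.de', '.en'),
    min_freq=2)
print(len(src_field.vocab), len(trg_field.vocab))
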
    spacy_de = nl_core_news_sm.load()
    spacy_en = en_core_web_sm.load()

    SRC = Field(tokenize=tokenize_de,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)
    TRG = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)

    train, valid, test = TranslationDataset.splits(path='./data/multi30k/',
                                                   exts=['.de', '.en'],
                                                   fields=[('src', SRC),
                                                           ('trg', TRG)],
                                                   train='train',
                                                   validation='val',
                                                   test='test2016')
    print(vars(train.examples[0]))
    SRC.build_vocab(train, min_freq=2)
    TRG.build_vocab(train, min_freq=2)

    BATCH_SIZE = 128

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, valid, test), batch_size=BATCH_SIZE, repeat=False)

    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    ENC_EMB_DIM = 256
Example #17
def load_dataset(batch_size):
    spacy_de = spacy.load('de')  # run in your (virtual) env first: python -m spacy download de
    spacy_en = spacy.load('en')  # run in your (virtual) env first: python -m spacy download en
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    # Create the dataset according to the Field objects.
    # A Field defines the basic tokens and the tokenizer, and it can build the vocab.
    # If you don't define init_token and eos_token, you will not get these tokens in the batches from train_iter.
    # Because init_token and eos_token are defined here, each example becomes init_token + sentence + eos_token
    # when train, val, test are created by TranslationDataset.splits.
    DE = Field(tokenize=tokenize_de,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    EN = Field(tokenize=tokenize_en,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')

    #you can find: len(val.examples)=1014; len(test.examples)=1000; len(train.examples)=29000 in Multi30k.splits...
    #train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))

    # I downloaded the data and read it directly:
    # if your file names differ from the defaults, change the parameters train='train', validation='val', test='test'
    # the exts parameter gives the data file extensions,
    # so each data file is resolved as path + (train/validation/test) + ext
    train, val, test = TranslationDataset.splits(path='./data/',
                                                 exts=('.de', '.en'),
                                                 fields=(DE, EN))

    # build the vocabulary
    # You can look up a word via DE.vocab.itos[0]; the order depends on token frequency.
    # You can also find the index of a word via DE.vocab.stoi['word'].
    # The '<pad>' token is added to the vocab automatically even if you never use it; it is often only used after creating the iterators.
    # The same holds for the unknown token '<unk>'. If you want init_token='<sos>' and eos_token='<eos>',
    # you need to pass them as arguments when creating the Field object.
    DE.build_vocab(train.src, min_freq=2)  # you can just use DE.build_vocab(train, min_freq=2), but not: DE.build_vocab(train.trg, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)  # you can just use EN.build_vocab(train, max_size=10000)

    # Create batches and pad so that every sentence within a batch has the same length.
    # If repeat=True, the loop 'for b, batch in enumerate(train_iter):' would run forever.
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
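
The comments above describe how the built vocab maps between tokens and indices; a short hedged demonstration, assuming the function is called as defined:

# Hypothetical usage: inspect the vocab objects returned by load_dataset.
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=2)
print(DE.vocab.itos[:4])       # specials such as '<unk>', '<pad>', '<sos>', '<eos>' come first
print(DE.vocab.stoi['<pad>'])  # index of the padding token
print(len(EN.vocab))           # bounded by max_size=10000 plus the special tokens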


# import re
# import spacy
# import torch
# from torchtext.data import Field, BucketIterator
# from torchtext.datasets import Multi30k, TranslationDataset

# spacy_de = spacy.load('de')  # run in your (virtual) env first: python -m spacy download de
# spacy_en = spacy.load('en')  # run in your (virtual) env first: python -m spacy download en
# url = re.compile('(<url>.*</url>)')

# def tokenize_de(text):
#     return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

# def tokenize_en(text):
#     return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

# DE = Field(tokenize=tokenize_de, include_lengths=True,
#             init_token='<sos>', eos_token='<eos>')
# EN = Field(tokenize=tokenize_en, include_lengths=True,
#             init_token='<sos>', eos_token='<eos>')

# #you can find: len(val.examples)=1014; len(test.examples)=1000; len(train.examples)=29000 in Multi30k.splits...
# #train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))

# # I downloaded the data and read it directly:
# # if your file names differ from the defaults, change the parameters train='train', validation='val', test='test'
# # the exts parameter gives the data file extensions,
# # so each data file is resolved as path + (train/validation/test) + ext
# train, val, test = TranslationDataset.splits(path='./data2/',exts=('.de', '.en'), fields=(DE, EN))

# # build the vocabulary
# # You can look up a word via DE.vocab.itos[0]; the order depends on token frequency
# # You can also find the index of a word via DE.vocab.stoi['word']

# DE.build_vocab(train, min_freq=2)
# EN.build_vocab(train, max_size=10000)

# for i in range(5):
#     print(DE.vocab.itos[i])

# train_iter, val_iter, test_iter = BucketIterator.splits(
#             (train, val, test), batch_size=2, repeat=False, sort=True, sort_within_batch=False)
# DE.vocab.stoi
# for i in range(5):
#     print(DE.vocab.itos[i])

# for i in range(len(EN.vocab)):
#     print(EN.vocab.itos[i])

# for e in range(3):
#     for b, batch in enumerate(train_iter):
#             src, len_src = batch.src
#             trg, len_trg = batch.trg
#             tensorToCsv2D(src,name='src',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')
#             tensorToCsv2D(len_src,name='len_src',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')
#             tensorToCsv2D(trg,name='trg',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')
#             tensorToCsv2D(len_trg,name='len_trg',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')

# import numpy
# def tensorToCsv2D(tensor,name='defualt',path=None,token=','):

#     def get_variable_name(variable):
#         callers_local_vars = inspect.currentframe().f_back.f_locals.items()
#         return [var_name for var_name, var_val in callers_local_vars if var_val is variable]

#     name = ''.join(get_variable_name(tensor))

#     assert(path is not None)

#     z = tensor.numpy().tolist()
#     if len(numpy.shape(z)) == 2:
#         with open(path,'a') as f:
#             f.write(name)
#             f.write('\r')
#             for i in range(numpy.shape(z)[0]):
#                 for j in range(numpy.shape(z)[1]):
#                     f.write(str(z[i][j]))
#                     f.write(token)
#                 f.write('\r')
#     elif len(numpy.shape(z)) == 1:
#         with open(path,'a') as f:
#             f.write(name)
#             f.write('\r')
#             for i in range(numpy.shape(z)[0]):
#                 f.write(str(z[i]))
#                 f.write(token)
#             f.write('\r')

# tensorToCsv2D(src,name='src',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')
# tensorToCsv2D(len_src,name='len_src',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')
# tensorToCsv2D(trg,name='trg',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')
# tensorToCsv2D(len_trg,name='len_trg',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')

# with open('/home/yj/Documents/Python/Github/seq2seq/data/gan.txt','w') as f:
#     f.write(str(src))
#     f.write(str(len_src))
#     f.write(str(trg))
#     f.write(str(len_trg))
# f
# z = src.numpy().tolist()
# z[0][0]
# len(numpy.shape(z))
# numpy.shape(z)[0]