Example #1
    def _createFields(self, min_occurance_freq):
        self.CAPTION_FIELD = data.ReversibleField(
            tokenize='spacy', init_token=self.start_token, 
            eos_token=self.end_token, pad_token=self.pad_token, lower=True, 
            batch_first=True, is_target=True, unk_token=UNKNOWN_TOKEN)

        self.INDEX_FIELD = data.Field(
            sequential=False, use_vocab=False, batch_first=True)

        if self.use_yt_categories:
            # preprocessing: if there is no category replace with -1 (unique number for dummy category)
            self.CATEGORY_FIELD = data.Field(
                sequential=False, use_vocab=False, batch_first=True, 
                preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x))))

            # filter out examples whose category is missing (-1) or is category 31, which has only one example
            self.filter_callback = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31
        else:
            self.CATEGORY_FIELD = None
            self.filter_callback = None

        if self.use_asr_subtitles:
            self.ASR_SUBTITLES_FIELD = data.ReversibleField(
                tokenize='spacy', init_token=self.start_token, 
                eos_token=self.end_token, pad_token=self.pad_token, lower=True, 
                batch_first=True, unk_token=UNKNOWN_TOKEN)
        else:
            self.ASR_SUBTITLES_FIELD = None
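
The `min_occurance_freq` argument is not used in this excerpt; vocabulary construction presumably happens in a separate step. A minimal sketch of what that step might look like, assuming a hypothetical companion method and a dataset whose columns are named `caption` and `subs`:

    def _buildVocabs(self, dataset, min_occurance_freq):
        # min_freq drops tokens seen fewer than min_occurance_freq times
        self.CAPTION_FIELD.build_vocab(dataset.caption, min_freq=min_occurance_freq)
        if self.use_asr_subtitles:
            self.ASR_SUBTITLES_FIELD.build_vocab(dataset.subs, min_freq=min_occurance_freq)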
Example #2
def get_dataset_iter(args, data_name="MR"):
    print("Loading data...")
    TEXT = data.ReversibleField(lower=True,
                                include_lengths=True,
                                batch_first=True)
    LABEL = data.Field(sequential=False)
    if data_name == "MR":
        train, test = MR.splits(TEXT, LABEL)
    else:
        train, test = myset.splits(TEXT, LABEL)

    print("Building vocabulary...")
    TEXT.build_vocab(train)
    LABEL.build_vocab(train)

    # print(type(TEXT.vocab.stoi))
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test),
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        batch_size=args.batch_size,
        device=-1,
        repeat=False)
    args.embed_num = len(TEXT.vocab)
    args.class_num = len(LABEL.vocab) - 1
    print("Loading data finish...")
    return train_iter, test_iter
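
What `ReversibleField` adds over a plain `Field` is the `reverse()` method, which maps a numericalized batch back into strings. A minimal sketch using the `TEXT` field and `train_iter` built above (with a non-list tokenizer, `reverse()` relies on the `revtok` package for detokenization):

# Peek at one batch and turn the numericalized text back into sentences.
for batch in train_iter:
    text_tensor, lengths = batch.text       # include_lengths=True yields (tensor, lengths)
    sentences = TEXT.reverse(text_tensor)   # list of detokenized strings
    print(sentences[0])
    break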
Example #3
def create_reversible_field(sql_vocab):
    sql_tokenizer = lambda x: x.split(Constants.SQL_SEPARATOR)
    sql = textdata.ReversibleField(tokenize=sql_tokenizer,
                                   init_token=Constants.BOS_WORD, eos_token=Constants.EOS_WORD,
                                   pad_token=Constants.PAD_WORD, unk_token=Constants.UNK_WORD)
    sql.vocab = sql_vocab

    return sql
Example #4
def main():
    tokenizer = MyTokenizer()

    TEXT = data.Field(sequential=True,
                      use_vocab=False,
                      tokenize=tokenizer.numbericalized_tokenize,
                      pad_token=0)
    SUMMARY = data.ReversibleField(sequential=True,
                                   init_token='<sos>',
                                   eos_token='<eos>')

    print('Data Loading...')
    train_data = data.TabularDataset(
        path='/home/yilin10945/summary/data/newsroom/train.200.json',
        format='json',
        fields={
            'text': ('text', TEXT),
            'summary': ('summary', SUMMARY)
        })
    SUMMARY.build_vocab(train_data, max_size=30000)
    #import pickle
    #pickle.dump((train_data, TEXT, SUMMARY), open('model/processed_data.pkl', 'wb'))
    print('Data Loaded!!!')

    hidden_size = 768
    vocab_size = len(SUMMARY.vocab)
    learning_rate = 0.0001
    n_epochs = 10
    batch_size = 16

    embedding = nn.Embedding(vocab_size, hidden_size)
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    bert_model.eval()
    attn_decoder = LuongAttnDecoderRNN('general', embedding, hidden_size,
                                       vocab_size, 1, 0.1).to(device)

    decoder_optimizer = optim.Adam(attn_decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    print('Start Training...')
    for epoch in range(n_epochs):
        running_loss = 0
        step = 0
        for batch in tqdm.tqdm(
                data.BucketIterator(dataset=train_data,
                                    batch_size=batch_size)):
            loss = train(batch.text, batch.summary.to(device), bert_model,
                         attn_decoder, decoder_optimizer, criterion)
            running_loss += loss
            step += batch_size

            if step % 128 == 0:
                print(f'Step: {step}, Training Loss: {running_loss/step}')
                torch.save(attn_decoder.state_dict(), f'model/{step}.pt')

        epoch_loss = running_loss / len(train_data)
        print(f'Epoch: {epoch}, Training Loss: {epoch_loss}')
Example #5
def load_data(batch_size):
    #sentence_field = data.ReversibleField(lower=True)
    sentence_field = torchtextdata.ReversibleField(
        lower=False, sequential=True
    )  #We will take care of lowercasing after the character model
    labels_field = torchtextdata.Field(lower=False, sequential=True)
    [train_iter,
     dev_iter] = read_train_and_dev_splits(sentence_field, labels_field,
                                           batch_size)
    return [train_iter, dev_iter, sentence_field, labels_field]
Example #6
    def __init__(self, config):

        device = config.device

        # disable tokenization if the config says so; otherwise `tokenize` is
        # expected to be defined at module level
        if not config.tokenize:
            tokenize = None

        # fields
        self.TEXT = data.ReversibleField(batch_first=True,
                                         tokenize=tokenize,
                                         lower=True)
        self.LABEL = data.ReversibleField(sequential=False, unk_token=None)

        # data split
        self.train, self.dev, self.test = datasets.MultiNLI.splits(
            self.TEXT, self.LABEL)

        # build vocabs
        self.TEXT.build_vocab(self.train, self.dev, self.test)
        self.LABEL.build_vocab(self.train)

        # add word vector
        add_vocab_vectors(self.TEXT, config)

        # create iterators
        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                        batch_sizes=(config.batch_size,
                                                    config.batch_size,
                                                    config.batch_size),
                                        device=device
                                        )
        self.train_iter.repeat = False
        self.dev_iter.repeat = False
        self.test_iter.repeat = False

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        self.vocab = self.TEXT.vocab

        config.n_embed = len(self.TEXT.vocab)
        config.d_out = len(self.LABEL.vocab)  # output size, num of classes
Example #7
def load_mentions_dataset(chat_dataset_path, csv_reader_params, word_embedding_file, fix_len = None):
    word_embedding_vectors = None
    if word_embedding_file:
        if word_embedding_file.endswith(".pt"):
            word_embedding_file = word_embedding_file[:-3]

        word_embedding_vectors = vocab.Vectors(
            word_embedding_file,
            cache = path.dirname(word_embedding_file))

    message_field = data.ReversibleField(sequential=True,
                                         # tokenize=heb_tokenize,
                                         fix_length = fix_len,
                                         init_token=config['SOS_TOKEN'],
                                         eos_token=config['EOS_TOKEN'],
                                         pad_first=False,
                                         include_lengths=True
                                         )

    fields = {
        'message': ('message', message_field)
    }

    dataset = data.TabularDataset(
        path=chat_dataset_path,
        format='csv',
        csv_reader_params=csv_reader_params,
        skip_header=False,
        fields=fields
    )

    if word_embedding_file:
        message_field.build_vocab(dataset, vectors=word_embedding_vectors)
    else:
        message_field.build_vocab(dataset)
    corpus_size = len(message_field.vocab)
    return message_field.vocab.vectors, corpus_size, dataset
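
A hedged sketch of how the returned triple might be consumed: copying the pretrained vectors into an embedding layer and batching the dataset. The file paths and hyperparameters below are placeholders, not values from the original project:

import torch.nn as nn
from torchtext import data

vectors, corpus_size, dataset = load_mentions_dataset(
    'chats.csv', {'delimiter': ','}, 'embeddings/word_vectors.txt')

embedding = nn.Embedding(corpus_size, vectors.size(1))
embedding.weight.data.copy_(vectors)            # initialize from the loaded vectors

iterator = data.BucketIterator(dataset, batch_size=32,
                               sort_key=lambda x: len(x.message))
for batch in iterator:
    messages, lengths = batch.message           # include_lengths=True yields (tensor, lengths)
    break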
Example #8
                        vocab.itos[ix.item() if hasattr(ix, "item") else ix]
                        for ix in ex
                    ]))
    return textlist


if __name__ == "__main__":
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    config = load_model_config(sys.argv[1])
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)

    # Dataset format
    text_field = torch_data.ReversibleField(sequential=True,
                                            lower=True,
                                            use_vocab=True,
                                            include_lengths=True,
                                            fix_length=256,
                                            tokenize="spacy")
    label_field = torch_data.Field(sequential=False,
                                   use_vocab=False,
                                   is_target=True)
    example_template = [('document', text_field), ('question', text_field),
                        ('answer1', text_field), ('answer2', text_field),
                        ('correct', label_field)]

    if sys.argv[2] == "train":
        # Read dataset
        mcscript_train, mcscript_dev, mcscript_val = mcread.read_mcscript(
            config["dataset_dir"], example_template)

        # Construct vocabulary
Example #9
def caption_iterator(cfg, batch_size, phase):
    print(f'Constructing caption_iterator for "{phase}" phase')
    spacy_en = spacy.load('en')

    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]

    CAPTION = data.ReversibleField(tokenize='spacy',
                                   init_token=cfg.start_token,
                                   eos_token=cfg.end_token,
                                   pad_token=cfg.pad_token,
                                   lower=True,
                                   batch_first=True,
                                   is_target=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=cfg.train_meta_path,
        format='tsv',
        skip_header=True,
        fields=fields,
    )
    CAPTION.build_vocab(dataset.caption,
                        min_freq=cfg.min_freq_caps,
                        vectors=cfg.word_emb_caps)
    train_vocab = CAPTION.vocab

    if phase == 'val_1':
        dataset = data.TabularDataset(path=cfg.val_1_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)
    elif phase == 'val_2':
        dataset = data.TabularDataset(path=cfg.val_2_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)
    elif phase == 'learned_props':
        dataset = data.TabularDataset(path=cfg.val_prop_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)

    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(y.caption))
    datasetloader = data.BucketIterator(dataset,
                                        batch_size,
                                        sort_key=lambda x: 0,
                                        device=torch.device(cfg.device),
                                        repeat=False,
                                        shuffle=True)
    return train_vocab, datasetloader
Example #10
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 64
embedding_dim = 200
hidden_dim = 200
epochs = 5

# define Field
TEXT = data.ReversibleField(lower=True, include_lengths=True)
LABEL = data.Field(sequential=False)
# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)
# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_dim))
LABEL.build_vocab(train)

train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    batch_size=batch_size,
    device=device,
    repeat=False)
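
The script stops after building the iterators. A hedged continuation showing how the GloVe vectors held in `TEXT.vocab.vectors` are typically copied into an embedding layer and how one batch unpacks (the field is not `batch_first`, so tensors are sequence-major):

vocab_size = len(TEXT.vocab)
embedding = nn.Embedding(vocab_size, embedding_dim).to(device)
embedding.weight.data.copy_(TEXT.vocab.vectors)   # pretrained GloVe 6B/200d vectors

for batch in train_iter:
    text, lengths = batch.text                    # include_lengths=True
    labels = batch.label
    embedded = embedding(text)                    # (seq_len, batch_size, embedding_dim)
    break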
Example #11
import torch
from torch import nn
from torchcrf import CRF
from torchtext import data, datasets
from NER_task.SequenceTaggingDataset import SequenceTaggingDataset

device = torch.device("cuda")


def light_tokenize(sequence: str):
    return [sequence]
TEXT = data.Field(sequential=True, tokenize=light_tokenize,include_lengths=True)
LABEL = data.ReversibleField(sequential=True, tokenize=light_tokenize, unk_token=None, is_target=True)

save_dir = 'save_models/model.pt'

train = SequenceTaggingDataset(
        path='../Datasets/NER_data/train.txt',separator=' ',
        fields=[('text', TEXT),
                ('label', LABEL)])

valid = SequenceTaggingDataset(
        path='../Datasets/NER_data/valid.txt',separator=' ',
        fields=[('text', TEXT),
                ('label', LABEL)])

TEXT.build_vocab(train)
LABEL.build_vocab(train)

train_iter, val_iter = data.BucketIterator.splits(
Example #12
def caption_iterator(start_token, end_token, pad_token, train_meta_path, val_1_meta_path,
                     val_2_meta_path, min_freq, batch_size, device, phase, use_categories, 
                     use_subs):
    spacy_en = spacy.load('en')
    print(f'Preparing dataset for {phase}')
    
    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]
    
    CAPTION = data.ReversibleField(
        tokenize='spacy', init_token=start_token, 
        eos_token=end_token, pad_token=pad_token, lower=True, 
        batch_first=True, is_target=True
    )
    INDEX = data.Field(
        sequential=False, use_vocab=False, batch_first=True
    )
    if use_categories:
        # preprocessing: if there is no category replace with -1 (unique number)
        CATEGORY = data.Field(
            sequential=False, use_vocab=False, batch_first=True, 
            preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x)))
        )
        # filter out examples whose category is missing (-1) or is category 31, which has only one example
        filter_pred = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31
    else:
        CATEGORY = None
        filter_pred = None
    
    if use_subs:
        SUBS = data.ReversibleField(
            tokenize='spacy', init_token=start_token, 
            eos_token=end_token, pad_token=pad_token, lower=True, 
            batch_first=True
        )
    else:
        SUBS = None
    
    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('category_32', CATEGORY),
        ('subs', SUBS),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=train_meta_path, format='tsv', skip_header=True, fields=fields,
        filter_pred=filter_pred
    )
    CAPTION.build_vocab(dataset.caption, min_freq=min_freq)
    train_vocab = CAPTION.vocab
    
    train_subs_vocab = None
    if use_subs:
        SUBS.build_vocab(dataset.subs, min_freq=min_freq)
        train_subs_vocab = SUBS.vocab
        
    if phase == 'val_1':
        dataset = data.TabularDataset(
            path=val_1_meta_path, format='tsv', skip_header=True, fields=fields,
            filter_pred=filter_pred
        )
    elif phase == 'val_2':
        dataset = data.TabularDataset(
            path=val_2_meta_path, format='tsv', skip_header=True, fields=fields, 
            filter_pred=filter_pred
        )
    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(x.caption))
    sort_key = lambda x: 0 #len(x.caption)
    datasetloader = data.BucketIterator(
        dataset, batch_size, sort_key=sort_key, device=device, repeat=False, shuffle=True
    )
    return train_vocab, train_subs_vocab, datasetloader
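
A hedged usage sketch for the function above; the paths and special tokens are placeholders. It iterates the returned loader and maps caption indices back to tokens through `train_vocab.itos`:

train_vocab, train_subs_vocab, loader = caption_iterator(
    '<s>', '</s>', '<pad>', 'train_meta.tsv', 'val_1_meta.tsv', 'val_2_meta.tsv',
    min_freq=2, batch_size=32, device='cpu', phase='train',
    use_categories=True, use_subs=True)

for batch in loader:
    captions = batch.caption                      # (batch, seq_len) since batch_first=True
    tokens = [train_vocab.itos[i] for i in captions[0].tolist()]
    print(batch.idx[0].item(), ' '.join(tokens))
    break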
Example #13
def main():
    ###############################
    # PREPROCESSING
    ###############################
    datasets = ["train", "val", "test"]
    for dataset in datasets:
        if not os.path.exists(os.path.join("data", dataset + ".tsv")):
            print("Creating TSV for " + dataset)
            convert_to_tsv(dataset)

    print("Creating datasets", end='', flush=True)
    curr_time = datetime.now()

    article_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor,
                                         lower=True,
                                         tokenize=tokenizer_in)
    summary_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor,
                                         lower=True,
                                         tokenize=tokenizer_out,
                                         init_token='<sos>')

    train_set = data.TabularDataset(path='./data/train.tsv',
                                    format='tsv',
                                    fields=[('article', article_field),
                                            ('summary', summary_field)])
    val_set = data.TabularDataset(path='./data/val.tsv',
                                  format='tsv',
                                  fields=[('article', article_field),
                                          ('summary', summary_field)])

    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    print("Building vocabulary and creating batches", end='', flush=True)
    article_field.build_vocab(train_set,
                              vectors="glove.6B.100d",
                              max_size=encoder_vocab_size)
    summary_field.build_vocab(train_set, max_size=decoder_vocab_size)

    train_iter = data.BucketIterator(dataset=train_set,
                                     batch_size=batch_size,
                                     sort_key=lambda x: len(x.article),
                                     repeat=False,
                                     device=DEVICE)
    val_iter = data.BucketIterator(dataset=val_set,
                                   batch_size=batch_size,
                                   sort_key=lambda x: len(x.article),
                                   repeat=False,
                                   device=DEVICE)

    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))
    ###############################
    # MODEL CREATION
    ###############################
    print("Creating encoder and decoder models", end='', flush=True)
    encoder = EncoderLSTM(input_size=encoder_vocab_size,
                          embed_size=embed_size,
                          hidden_size=encoder_hidden_size,
                          use_gpu=True,
                          gpu_device=DEVICE,
                          batch_size=batch_size)
    encoder.embedding.weight.data = article_field.vocab.vectors
    encoder.cuda(device=DEVICE)

    decoder = AttnDecoderLSTM(input_size=encoder_vocab_size,
                              embed_size=embed_size,
                              hidden_size=decoder_hidden_size,
                              output_size=decoder_vocab_size,
                              use_gpu=True,
                              gpu_device=DEVICE,
                              batch_size=batch_size)
    decoder.embedding.weight.data = article_field.vocab.vectors
    decoder.cuda(device=DEVICE)
    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    # Loss and SGD optimizers
    loss_func = nn.NLLLoss(ignore_index=1)  # Ignore <pad> token
    encoder_opt = optim.Adam(encoder.parameters(), lr=lr)
    decoder_opt = optim.Adam(decoder.parameters(), lr=lr)

    ###############################
    # TRAINING
    ###############################
    print("Beginning training")
    tqdm_epoch = tqdm(range(num_epochs), desc="Epoch")
    for epoch in tqdm_epoch:
        train_iter.init_epoch()
        tqdm_batch = tqdm(train_iter, desc="Batch")
        for b_id, batch in enumerate(tqdm_batch):
            encoder.batch_size = batch.batch_size  # Fixes weird bug where we get batch sizes that are not batch_size
            decoder.batch_size = batch.batch_size
            avg_loss = train(batch, encoder, decoder, encoder_opt, decoder_opt,
                             loss_func, teacher_forcing_ratio)

    ###############################
    # TESTING
    ###############################
    # Load test set
    print("Loading test set")
    test_set = data.TabularDataset(path='./data/test.tsv',
                                   format='tsv',
                                   fields=[('article', article_field),
                                           ('summary', summary_field)])
    test_iter = data.BucketIterator(dataset=test_set,
                                    batch_size=batch_size,
                                    sort_key=lambda x: len(x.article),
                                    repeat=False,
                                    device=DEVICE)
    print("Evaluating model")
    evaluate(encoder=encoder,
             decoder=decoder,
             dataset=test_iter,
             rev_field=article_field)
Example #14
    return [tok.text for tok in nlp.tokenizer(text)]


def emb_tokenizer(l):
    r = [y for x in eval(l) for y in x]
    return r


def y_tokenize(y):
    return int(y)


TEXT = data.Field(sequential=True, tokenize=tokenizer, batch_first=True)
#LABEL = data.Field(sequential=False, use_vocab=True,batch_first=True)
LABEL = data.ReversibleField(sequential=False,
                             unk_token='OTHER',
                             use_vocab=True,
                             batch_first=True)
POS_EMB = data.Field(sequential=True, tokenize=emb_tokenizer, batch_first=True)

print('loading data...')
train, valid, test = data.TabularDataset.splits(
    path='../data/SemEval2010_task8_all_data',
    train='SemEval2010_task8_training/TRAIN_FILE_SUB.CSV',
    validation='SemEval2010_task8_training/VALID_FILE.CSV',
    test='SemEval2010_task8_testing_keys/TEST_FILE_FULL.CSV',
    format='csv',
    skip_header=True,
    csv_reader_params={'delimiter': '\t'},
    fields=[('relation', LABEL), ('sentence', TEXT), ('pos_embed', POS_EMB)])
print('load data end')
#print(valid[0].__dict__)
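
The snippet ends right after loading the splits; vocabularies still need to be built before batching. A hedged sketch of the usual next steps (not part of the original script):

TEXT.build_vocab(train)
LABEL.build_vocab(train)          # unk_token='OTHER' is added as a regular label
POS_EMB.build_vocab(train)

train_iter = data.BucketIterator(train, batch_size=64,
                                 sort_key=lambda x: len(x.sentence))

# After prediction, a class index maps back to a relation string via the vocab:
#   predicted_relation = LABEL.vocab.itos[pred_idx]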
Example #15
    def __init__(self, config, lm_config, device):
        # define all fields
        TEXT = data.ReversibleField(sequential=True, tokenize=self.tokenizer,
                                    lower=False, include_lengths=False)
        POS = data.ReversibleField(sequential=True, lower=False, include_lengths=True)
        NER = data.ReversibleField(sequential=True, lower=False, include_lengths=True)
        LABEL = data.Field(sequential=False, use_vocab=False)
        IN_Q = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                          postprocessing=self.to_numeric)
        IN_C = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                          postprocessing=self.to_numeric)
        LEMMA_IN_Q = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                                postprocessing=self.to_numeric)
        LEMMA_IN_C = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                                postprocessing=self.to_numeric)
        TF = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                        postprocessing=self.to_numeric)
        REL = data.ReversibleField(sequential=True, lower=False, include_lengths=True)

        # load lm data first
        lm_train = datasets.LanguageModelingDataset(os.path.join(lm_config.file_path, lm_config.train_f),
                                                    TEXT, newline_eos=False)
        lm_dev = datasets.LanguageModelingDataset(os.path.join(lm_config.file_path, lm_config.dev_f),
                                                  TEXT, newline_eos=False)

        # load actual data
        # we have keys: 'id', 'd_words', 'd_pos', 'd_ner', 'q_words', 'q_pos', 'c_words',
        #       'label', 'in_q', 'in_c', 'lemma_in_q', 'tf', 'p_q_relation', 'p_c_relation'
        train, val, test = data.TabularDataset.splits(
            path=config.data_dir, train=config.train_fname,
            validation=config.dev_fname, test=config.test_fname, format='json',
            fields={'d_words': ('d_words', TEXT),
                    'd_pos':   ('d_pos', POS),
                    'd_ner':   ('d_ner', NER),
                    'q_words': ('q_words', TEXT),
                    'q_pos':   ('q_pos', POS),
                    'c_words': ('c_words', TEXT),
                    'label': ('label', LABEL),
                    'in_q': ('in_q', IN_Q),
                    'in_c': ('in_c', IN_C),
                    'lemma_in_q': ('lemma_in_q', LEMMA_IN_Q),
                    'lemma_in_c': ('lemma_in_c', LEMMA_IN_C),
                    'tf': ('tf', TF),
                    'p_q_relation': ('p_q_relation', REL),
                    'p_c_relation': ('p_c_relation', REL)
                    })

        print('train: %d, val: %d, test: %d' % (len(train), len(val), len(test)))

        # construct vocabulary
        TEXT.build_vocab(train, val, test, lm_train, lm_dev, vectors=config.vectors)
        POS.build_vocab(train, val, test)
        NER.build_vocab(train, val, test)
        REL.build_vocab(train, val, test)

        print('vocab size: %d' % len(TEXT.vocab))
        print('pos size: %d' % len(POS.vocab))
        print('ner size: %d' % len(NER.vocab))
        print('rel size: %d' % len(REL.vocab))

        self.TEXT = TEXT

        # iterators
        self.lm_train_iter = data.BPTTIterator(lm_train, batch_size=lm_config.batch_size,
                                               bptt_len=lm_config.bptt_len, repeat=False)
        self.lm_dev_iter = data.BPTTIterator(lm_dev, batch_size=lm_config.batch_size,
                                             bptt_len=lm_config.bptt_len, repeat=False)

        print('lm train batch num: %d, lm dev batch num: %d' %
              (len(self.lm_train_iter), len(self.lm_dev_iter)))

        self.train_iter = data.BucketIterator(dataset=train, batch_size=config.batch_size_train,
                                              sort_key=lambda x: len(x.d_words), device=device, shuffle=True,
                                              sort_within_batch=False, repeat=False)

        self.val_iter = data.Iterator(dataset=val, batch_size=config.batch_size_eval,
                                      sort_key=lambda x: len(x.d_words),
                                      train=False, shuffle=False, sort_within_batch=False, device=device,
                                      repeat=False)

        self.test_iter = data.Iterator(dataset=test, batch_size=config.batch_size_test,
                                       sort_key=lambda x: len(x.d_words), train=False, shuffle=False,
                                       sort_within_batch=False, device=device, repeat=False)

        print('train batch num: %d, dev batch num: %d' %
              (len(self.train_iter), len(self.val_iter)))

        # # Create embeddings
        embedding = nn.Embedding(len(TEXT.vocab), config.embed_dim)
        embedding.weight.data.copy_(TEXT.vocab.vectors)
        embedding.weight.requires_grad = False
        self.embedding = embedding.to(device)

        embedding_pos = nn.Embedding(len(POS.vocab), config.embed_dim_pos)
        embedding_pos.weight.data.normal_(0, 0.1)
        self.embedding_pos = embedding_pos.to(device)

        embedding_ner = nn.Embedding(len(NER.vocab), config.embed_dim_ner)
        embedding_ner.weight.data.normal_(0, 0.1)
        self.embedding_ner = embedding_ner.to(device)

        embedding_rel = nn.Embedding(len(REL.vocab), config.embed_dim_rel)
        embedding_rel.weight.data.normal_(0, 0.1)
        self.embedding_rel = embedding_rel.to(device)

        print('embedding', self.embedding)
        print('embedding_pos', self.embedding_pos)
        print('embedding_ner', self.embedding_ner)
        print('embedding_rel', self.embedding_rel)

        self.vocab_size = len(TEXT.vocab)
        print('vocab_size is', self.vocab_size)
Example #16
    with open('../../data/clinvar/text_classification_db_labels.json',
              'r') as f:
        labels = json.load(f)

    # map labels to list
    label_list = [None] * len(labels)
    for k, v in labels.items():
        label_list[v] = k

    labels = label_list
    logger.info("available labels: ")
    logger.info(labels)

    TEXT = data.ReversibleField(sequential=True,
                                tokenize=tokenizer,
                                lower=True,
                                include_lengths=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    if args.dataset == 'merged':
        train, val, test = data.TabularDataset.splits(
            path='../../data/clinvar/',
            train='merged_text_classification_db_train.tsv',
            validation='merged_text_classification_db_valid.tsv',
            test='merged_text_classification_db_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])
    else:
        train, val, test = data.TabularDataset.splits(
            path='../../data/clinvar/',
            train='text_classification_db_train.tsv',
            validation='text_classification_db_valid.tsv',
Example #17
def main():
    parser = argparse.ArgumentParser()

    # parser.add_argument('-data', required=True)

    parser.add_argument('-max_len', '--max_word_seq_len', type=int, default=50)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')

    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                            decoded sentences""")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Quick configuration overrides
    # opt.n_layers = 1
    # opt.batch_size = 4
    opt.cuda = torch.cuda.is_available()
    opt.epoch = 2000
    opt.save_model = 'trained'
    opt.model = 'trained.chkpt'

    opt.d_word_vec = 300
    opt.d_model = 300
    opt.d_inner_hid = 600

    opt.embs_share_weight = True

    opt.beam_size = 1

    opt.max_len = 50
    opt.max_token_seq_len = opt.max_len + 2  # includes <BOS> and <EOS>

    opt.device = None if torch.cuda.is_available() else -1

    # =========== prepare dataset ===========
    def len_filter(example):
        return len(example.src) <= opt.max_len and len(
            example.tgt) <= opt.max_len

    EN = data.ReversibleField(init_token=Constants.BOS_WORD,
                              eos_token=Constants.EOS_WORD,
                              batch_first=True)
    train, val = Lang8.splits(exts=('.err.bpe', '.cor.bpe'),
                              fields=[('src', EN), ('tgt', EN)],
                              train='test',
                              validation='test',
                              test=None,
                              filter_pred=len_filter)
    # adv_train, adv_dev, adv_test = Lang8.splits(
    #     exts=('.adv.cor', '.adv.err'), fields=[('src', src), ('tgt', tgt)],
    #     train='test', validation='test', test='test')
    # BD.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
    # GD.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
    EN.build_vocab(train, vectors=FastText())
    print('vocab len: %d' % len(EN.vocab))

    # Check that the Constants indices are correct
    assert EN.vocab.stoi[EN.init_token] == Constants.BOS
    assert EN.vocab.stoi[EN.eos_token] == Constants.EOS
    assert EN.vocab.stoi[EN.pad_token] == Constants.PAD
    assert EN.vocab.stoi[EN.unk_token] == Constants.UNK
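    # (With torchtext's default ordering of special tokens -- unk, pad, init, eos --
    # these assertions imply Constants.UNK == 0, PAD == 1, BOS == 2 and EOS == 3.)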

    # ---------- init model ----------
    # if opt.embs_share_weight and train.src_word2idx != train.tgt_word2idx:
    #     print('[Warning] The src/tgt word2idx table are different but asked to share word embedding.')

    print(opt)

    transformer = Transformer(
        len(EN.vocab),
        len(EN.vocab),
        opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout,
        encoder_emb_weight=EN.vocab.vectors,
        decoder_emb_weight=EN.vocab.vectors,
    )

    discriminator = TestDiscriminator(
        len(EN.vocab),
        d_model=300,
        max_len=opt.max_token_seq_len,
    )

    print(transformer)
    print(discriminator)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    optimizer_G = optim.Adam(transformer.get_trainable_parameters(),
                             lr=1e-4,
                             betas=(0.5, 0.9))
    optimizer_D = optim.Adam(discriminator.parameters(),
                             lr=1e-4,
                             betas=(0.5, 0.9))

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(len(EN.vocab))

    if opt.cuda:
        transformer.cuda()
        discriminator.cuda()
        crit.cuda()

    # =========== training ===========
    supervised_trainer = trainers.TransformerTrainer()
    # trainer.train(transformer, train, val, crit, optimizer, opt, GD)

    # train_iter, val_iter = data.BucketIterator.splits(
    #     (train, val), batch_sizes=(4, 256), device=opt.device,
    #     sort_key=lambda x: len(x.src))
    # batch = next(iter(train_iter))
    # src_seq = batch.src
    # tgt_seq = batch.tgt
    # src_pos = transformer.get_position(src_seq.data)
    # tgt_pos = transformer.get_position(tgt_seq.data)
    #
    # # print(tgt_seq)
    # # print(src_pos)
    # # print(tgt_pos)
    #
    # transformer(src_seq, src_pos, tgt_seq, tgt_pos)
    # output = transformer(src_seq, src_pos)
    # print(output)
    #
    # print(discriminator(output))

    # =========== WGAN training ===========
    wgan_trainer = WganTrainer(opt)
    train_iter, val_iter = data.BucketIterator.splits(
        (train, val),
        batch_sizes=(16, 64),
        device=opt.device,
        sort_key=lambda x: len(x.src),
        repeat=False)

    for epoch in range(opt.epoch):
        print('[Epoch %d]' % epoch)
        wgan_trainer.train_epoch(epoch,
                                 D=discriminator,
                                 G=transformer,
                                 optimizer_D=optimizer_D,
                                 optimizer_G=optimizer_G,
                                 train_iter=train_iter,
                                 n_tgt_vocab=len(EN.vocab))
        valid_loss, valid_accu, bleu = supervised_trainer.evaluate(
            transformer, val_iter, crit, EN)
        print('(Validation) ppl: %8.5f, accuracy: %3.3f%%, BLEU %2.2f' %
              (math.exp(min(valid_loss, 100)), 100 * valid_accu, bleu))