Example #1
def create_dataset(file_type, folder, train_diffs, train_msgs, test_diffs,
                   test_msgs, valid_diffs, valid_msgs):

    (train_diffs, train_msgs, train_cnt, vocab_diffs,
     vocab_msgs) = get_dataset(file_type, train_diffs, train_msgs)
    test_diffs, test_msgs, test_cnt, _, _ = get_dataset(
        file_type, test_diffs, test_msgs)
    valid_diffs, valid_msgs, valid_cnt, _, _ = get_dataset(
        file_type, valid_diffs, valid_msgs)

    remove_dir(folder)
    make_dirs(folder)

    save_dataset(folder, "train." + str(train_cnt), train_diffs, train_msgs)
    save_dataset(folder, "test." + str(test_cnt), test_diffs, test_msgs)
    save_dataset(folder, "valid." + str(valid_cnt), valid_diffs, valid_msgs)
    save_vocab(folder, vocab_diffs, vocab_msgs)
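The save_dataset and save_vocab helpers are external to this snippet. A minimal sketch of save_dataset, assuming it writes the diff/message pairs as two line-aligned files (the file names and extensions are assumptions, not the project's actual layout):

import os

def save_dataset(folder, prefix, diffs, msgs):
    # Write line-aligned <prefix>.diff and <prefix>.msg files (names assumed).
    with open(os.path.join(folder, prefix + ".diff"), "w", encoding="utf-8") as f_diff, \
         open(os.path.join(folder, prefix + ".msg"), "w", encoding="utf-8") as f_msg:
        for diff, msg in zip(diffs, msgs):
            f_diff.write(diff.strip() + "\n")
            f_msg.write(msg.strip() + "\n")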
Example #2
    def train_compass(self, compass_text, overwrite=False):
        compass_exists = os.path.isfile(
            os.path.join(self.opath, "compass.model"))
        if compass_exists and overwrite is False:
            self.compass = Word2Vec.load(
                os.path.join(self.opath, "compass.model"))
            print("Compass loaded from file.")
        else:
            sentences = CreateCorpus(compass_text)
            print("Training the compass.")
            if compass_exists:
                print("Compass will be overwritten after training")
            self.compass = self.train_model(sentences)
            self.compass.save(os.path.join(self.opath, "compass.model"))
            save_vocab(self.compass, "compass_twec")

        self.gvocab = self.compass.wv.vocab
Example #3
    def train_slice(self, slice_text, save=True):
        if self.compass is None:
            raise Exception("Missing Compass")
        print("Training temporal embeddings: slice {}.".format(slice_text))

        sentences = CreateCorpus(slice_text)

        model = self.train_model(sentences)
        model_name = os.path.splitext(os.path.basename(slice_text))[0]
        self.trained_slices[model_name] = model

        # modified saving function to save in w2v format
        if save:
            model.save(os.path.join(self.opath, model_name + ".model"))
            # Save vocab
            save_vocab(model, f'{model_name}_twec')

        return self.trained_slices[model_name]
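Examples #2 and #3 are meant to be used in a compass-first order: train (or load) the shared compass once, then train each temporal slice against it. A hedged usage sketch, where the wrapper class name and constructor arguments are assumptions:

aligner = TemporalEmbeddings(opath="model")           # hypothetical wrapper exposing the methods above
aligner.train_compass("data/all_periods.txt")         # shared compass, trained once or loaded from disk
slice_2010 = aligner.train_slice("data/2010.txt")     # per-period model aligned to the compass
slice_2020 = aligner.train_slice("data/2020.txt")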
Example #4
def main(
    path_data: str,
    epochs: int,
    batch: int,
    vector_size: int,
    window: int,
    path_vectors: str,
    max_vocab: int,
    min_count: int,
    alpha: float,
    lr: float,
    x_max: int,
    save_mode: int,
):
    print("Preprocessing...")
    first_indices, second_indices, freq, word_index, word_counts = preprocessing(
        path_data, max_vocab, min_count, window)

    vocab_size = len(word_counts) + 1
    print("Vocab size:", vocab_size)
    print("Training...")
    model = train(
        first_indices=first_indices,
        second_indices=second_indices,
        frequencies=freq,
        epochs=epochs,
        batch=batch,
        vector_size=vector_size,
        vocab_size=vocab_size,
        alpha=alpha,
        lr=lr,
        x_max=x_max,
    )
    print("Saving vocab...")
    utils.save_vocab(config.VOCAB, word_counts)
    print("Saving embeddings file...")
    # path_folder = config.EMBEDDINGS.split("/")[0]
    # if not os.path.isdir(path_folder):
    #     os.mkdir(path_folder)
    utils.save_word2vec_format(model, path_vectors, word_index, vector_size,
                               save_mode)
        raise NotImplementedError()

    image_postfix = ".png"

helper = DataHelper(args.annot_file, args.ques_file)

# Write dataset to file
with open(args.output_file, "w") as output_file:
    for i in range(len(helper.dataset['annotations'])):

        img_id = helper.dataset['annotations'][i]['image_id']
        img_name = image_prefix + pad_with_zero(img_id, args) + image_postfix

        ques_id = helper.dataset['annotations'][i]['question_id']
        question = helper.qqa[ques_id]['question']

        # Convert to comma-separated token string
        question = ','.join(question.strip().split())

        answer = helper.dataset['annotations'][i]['multiple_choice_answer']

        # each line contains: image_filename [tab] question [tab] answer
        output_file.write(img_name + "\t" + question + "\t" + answer + "\n")

print('Saved dataset file at: {}'.format(args.output_file))

# Read the newly created dataset file to build the vocabulary & save to disk
if args.vocab_file:
    save_vocab(args.output_file, args.vocab_file, args.min_word_count,
               args.num_cls)
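pad_with_zero is not defined in this snippet; a hedged guess at it, assuming it zero-pads the numeric image id to the fixed width used in COCO-style file names (the attribute looked up on args is hypothetical):

def pad_with_zero(image_id, args):
    # COCO image file names embed a zero-padded numeric id; 12 digits is the
    # usual convention, and the args attribute below is an assumed override.
    width = getattr(args, "image_id_width", 12)
    return str(image_id).zfill(width)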
Example #6
def main(args):
    device = torch.device('cuda' if args.gpu else 'cpu')

    # construct Field objects
    SRC = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')
    TGT = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')
    fields = [('src', SRC), ('tgt', TGT)]

    slen_filter = lambda x: args.src_minlen <= len(x.src) <= args.src_maxlen \
                    and args.tgt_minlen <= len(x.tgt) <= args.tgt_maxlen

    train_data = data.TabularDataset(
        path=args.train,
        format='tsv',
        fields=fields,
        filter_pred=slen_filter,
    )

    valid_data = data.TabularDataset(
        path=args.valid,
        format='tsv',
        fields=fields,
        filter_pred=slen_filter,
    )

    # construct Vocab objects
    SRC.build_vocab(train_data, min_freq=args.src_min_freq)
    if args.src_embed_path is not None:
        vector = utils.load_vector(args.src_embed_path)
        SRC.vocab.load_vectors(vector)

    TGT.build_vocab(train_data, min_freq=args.tgt_min_freq)
    if args.tgt_embed_path is not None:
        vector = utils.load_vector(args.tgt_embed_path)
        TGT.vocab.load_vectors(vector)

    # save fields
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    utils.save_field(args.savedir, fields)
    utils.save_vocab(args.savedir, fields)

    # set iterator
    train_iter, valid_iter = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=args.batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        repeat=False,
    )

    print(f'| [src] Dictionary: {len(SRC.vocab.itos)} types')
    print(f'| [tgt] Dictionary: {len(TGT.vocab.itos)} types')
    print('')

    for iter_name, iterator in [('train', train_iter), ('valid', valid_iter)]:
        file_path = args.train if iter_name == 'train' else args.valid
        data_object = train_data if iter_name == 'train' else valid_data
        print(f' {iter_name}: {file_path}')
        for name, field in fields:
            n_tokens, n_unk = utils.get_statics(iterator, name, field)
            # exclude the <bos> and <eos> tokens added to every example
            n_tokens -= 2 * len(data_object)
            print(f'| [{name}] {n_tokens} tokens,', end='')
            print(f' coverage: {100*(n_tokens-n_unk)/n_tokens:.{4}}%')
        print('')

    # construct model
    model = Transformer(fields, args).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=TGT.vocab.stoi['<pad>'])
    optimizer_fn = utils.get_optimizer(args.optimizer)
    optimizer = optimizer_fn(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='min',
                                                     patience=5)
    trainer = Trainer(model,
                      criterion,
                      optimizer,
                      scheduler,
                      args.clip,
                      iteration=0)

    print('=============== MODEL ===============')
    print(model)
    print('')
    print('=============== OPTIMIZER ===============')
    print(optimizer)
    print('')

    epoch = 1
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    best_loss = math.inf

    while (epoch < max_epoch and trainer.n_updates < max_update
           and args.min_lr < trainer.get_lr()):
        # train
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            trainer.model.train()
            for samples in pbar:
                bsz = samples.src.size(1)
                srcs = samples.src.to(device)
                tgts = samples.tgt.to(device)
                loss = trainer.step(srcs, tgts)
                train_loss += loss.item()

                # setting of progressbar
                pbar.set_description(f"epoch {str(epoch).zfill(3)}")
                progress_state = OrderedDict(loss=loss.item(),
                                             ppl=math.exp(loss.item()),
                                             bsz=len(samples),
                                             lr=trainer.get_lr(),
                                             clip=args.clip,
                                             num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        train_loss /= len(train_iter)

        print(f"| epoch {str(epoch).zfill(3)} | train ", end="")
        print(f"| loss {train_loss:.{4}} ", end="")
        print(f"| ppl {math.exp(train_loss):.{4}} ", end="")
        print(f"| lr {trainer.get_lr():.1e} ", end="")
        print(f"| clip {args.clip} ", end="")
        print(f"| num_updates {trainer.n_updates} |")

        # validation
        valid_loss = 0.0
        trainer.model.eval()
        for samples in valid_iter:
            bsz = samples.src.size(1)
            srcs = samples.src.to(device)
            tgts = samples.tgt.to(device)
            loss = trainer.step(srcs, tgts)
            valid_loss += loss.item()
        valid_loss /= len(valid_iter)

        print(f"| epoch {str(epoch).zfill(3)} | valid ", end="")
        print(f"| loss {valid_loss:.{4}} ", end="")
        print(f"| ppl {math.exp(valid_loss):.{4}} ", end="")
        print(f"| lr {trainer.get_lr():.1e} ", end="")
        print(f"| clip {args.clip} ", end="")
        print(f"| num_updates {trainer.n_updates} |")

        # saving model
        save_vars = {
            'epoch': epoch,
            'iteration': trainer.n_updates,
            'best_loss': valid_loss if valid_loss < best_loss else best_loss,
            'args': args,
            'weights': model.state_dict()
        }

        if valid_loss < best_loss:
            best_loss = valid_loss
            filename = os.path.join(args.savedir, 'checkpoint_best.pt')
            torch.save(save_vars, filename)
        if epoch % args.save_epoch == 0:
            filename = os.path.join(args.savedir, f'checkpoint_{epoch}.pt')
            torch.save(save_vars, filename)
        filename = os.path.join(args.savedir, 'checkpoint_last.pt')
        torch.save(save_vars, filename)

        # update
        trainer.scheduler.step(valid_loss)
        epoch += 1
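utils.save_field and utils.save_vocab are project helpers that are not shown. A minimal sketch of the vocabulary-saving side, assuming one plain-text file per field with one token per line in index order (the file naming is an assumption):

import os

def save_vocab(savedir, fields):
    # Dump each torchtext field's vocabulary, one token per line, in itos order.
    for name, field in fields:
        path = os.path.join(savedir, f"{name}_vocab.txt")
        with open(path, "w", encoding="utf-8") as f:
            for token in field.vocab.itos:
                f.write(token + "\n")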
# RNN_UNITS = 512
# RNN_TYPE = 'lstm'

##############################

vocab_tone, idx2char_tone, char2idx_tone = build_vocab_tone(tone_dataframe)

dataset_tone = build_dataset_tone(tone_dataframe, vocab_tone, idx2char_tone,
                                  char2idx_tone, MAX_WORD_LENGTH)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file_tone = os.path.join(logs_dir, 'vocab_tone.json')

save_vocab(vocab_tone, idx2char_tone, char2idx_tone, vocab_file_tone)

dataset_train_tone, dataset_val_tone = split_dataset(dataset_tone)

dataset_train_tone = dataset_train_tone.batch(BATCH_SIZE, drop_remainder=True)
dataset_val_tone = dataset_val_tone.batch(BATCH_SIZE, drop_remainder=True)

model_tone = build_tonenet_model(
    name='ToneNetwork',
    vocab_size=len(vocab_tone),
    max_word_len=MAX_WORD_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
# Load the training dataset

word_sents, label_sents = conll_dataset_to_word_AND_label_sents("train")

# Count the number of occurrences of each lowercased word

nb_occurs = {}
for word_sent in word_sents:
    for word in word_sent:
        lword = word.lower()
        if lword not in nb_occurs:
            nb_occurs[lword] = 0
        nb_occurs[lword] += 1

# Keep only the most frequent words
# This is done to improve generalization on never-seen-before words.

sorted_nb_occurs = sorted(nb_occurs.items(),
                          key=lambda kv: kv[1],
                          reverse=True)
sorted_nb_occurs = sorted_nb_occurs[:int(args.keep * len(nb_occurs))]

# Build and save vocabulary
# Note: id 0 is reserved for padding and id 1 for never-seen-before words.

vocab = {}
for i, (lword, _) in enumerate(sorted_nb_occurs):
    vocab[lword] = i + 2
save_vocab(vocab)
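A short sketch of how a vocabulary built this way is typically consumed afterwards; the PAD_ID and UNK_ID names are assumptions that mirror the reservation described in the comment above:

PAD_ID, UNK_ID = 0, 1

def encode_sentence(words, vocab):
    # Lowercase to match the vocabulary keys; unseen words fall back to UNK_ID.
    return [vocab.get(word.lower(), UNK_ID) for word in words]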
Example #9
def main(args):
    device = torch.device('cuda' if args.gpu else 'cpu')

    if args.re_training is None:
        TEXT = data.Field(
            lower=True, 
            init_token='<bos>', 
            eos_token='<eos>'
        )
    else: 
        basedir, _ = os.path.split(args.re_training)
        path = os.path.join(basedir, 'text.field')
        TEXT = utils.load_field(path)

    fields = [('text', TEXT)] if args.task in monolingual_tasks \
                else [('src', TEXT), ('tgt', TEXT)]

    slen_filter = lambda x: args.src_minlen <= len(x.src) <= args.src_maxlen \
                         and args.tgt_minlen <= len(x.tgt) <= args.tgt_maxlen

    # load training data
    if args.task == 'translation':
        train_data = data.TabularDataset(
                path=args.train,
                format='tsv',
                fields=fields,
                filter_pred=slen_filter,
        )
    else: # `causal`, `masked`
        train_data = datasets.LanguageModelingDataset(
            path=args.train, 
            text_field=TEXT, 
            newline_eos=True
        )

    # set Vocabulary object
    if args.re_training is None:
        TEXT.build_vocab(
            train_data, 
            min_freq=args.min_freq, 
            specials=['<sep>', '<mask>'], 
        )
        if args.embed_path:
            vectors = utils.load_vector(args.embed_path)
            TEXT.vocab.load_vectors(vectors)

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    # save a field object
    with open(os.path.join(args.savedir, 'text.field'), 'wb') as fout:
        dill.dump(TEXT, fout)
    utils.save_vocab(args.savedir, TEXT)

    # set training iterator
    if args.task == 'translation':
        train_iter = data.BucketIterator(
            train_data, 
            batch_size=args.batch_size,
            sort_within_batch=True,
            sort_key= lambda x: len(x.src),
            repeat=False,
        )
    else: # `causal`, `masked`
        train_iter = data.BPTTIterator(
            train_data, 
            batch_size=args.batch_size, 
            bptt_len=args.bptt_len,
            train=True, 
            repeat=False, 
            shuffle=True,
        )

    print(f'| [text] Dictionary: {len(TEXT.vocab.itos)} types')
    print('')

    print(f'train: {args.train}')
    for name, field in fields:
        n_tokens, n_unk = utils.get_statics(train_iter, name, field)
        print(f'| [{name}] {n_tokens} tokens,', end='')
        print(f' coverage: {100*(n_tokens-n_unk)/n_tokens:.{4}}%')
    print('')

    # build a model
    model_class = get_model(args.task)

    if args.re_training is None:
        epoch = 1
        iteration = 0
        best_loss = math.inf
        model = model_class(TEXT, args).to(device)
    else:
        load_vars = torch.load(args.re_training)
        epoch = load_vars['epoch'] + 1
        iteration = load_vars['iteration']
        best_loss = load_vars['best_loss']
        lm_args, lm_weights = load_vars['args'], load_vars['weights']
        model = model_class(TEXT, lm_args)
        model.load_state_dict(lm_weights)
        model.to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
    optimizer_fn = utils.get_optimizer(args.optimizer)
    optimizer = optimizer_fn(model.parameters(), lr=args.lr)
    trainer = Trainer(model, criterion, optimizer, args.clip, iteration)

    # show the details of model and optimizer
    print('=============== MODEL ===============')
    print(model)
    print('')
    print('=============== OPTIMIZER ===============')
    print(optimizer)
    print('')

    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    assert not(max_epoch == math.inf and max_update == math.inf), \
        'Please set `--max-epoch` or `--max-update`.'
 
    while epoch <= max_epoch and trainer.n_updates <= max_update:
        # training
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            trainer.model.train()
            for samples in pbar:
                if args.task in monolingual_tasks:
                    srcs = samples.text.to(device)
                    tgts = None
                    refs = None if args.task == 'masked' \
                            else samples.target.to(device)
                else:
                    srcs = samples.src.to(device)
                    tgts = samples.tgt.to(device)
                    refs = None
                loss = trainer.step(srcs, tgts, refs)
                train_loss += loss.item()

                # setting of progressbar
                pbar.set_description(f'epoch {str(epoch).zfill(3)}')
                progress_state = OrderedDict(
                    task=args.task,
                    loss=loss.item(),
                    ppl=math.exp(loss.item()),
                    bsz=srcs.size(1),
                    lr=trainer.get_lr(), 
                    clip=args.clip, 
                    num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        train_loss /= len(train_iter)

        print(f'| epoch {str(epoch).zfill(3)} | train ', end='') 
        print(f'| loss {train_loss:.{4}} ', end='')
        print(f'| ppl {math.exp(train_loss):.{4}} ', end='')
        print(f'| lr {trainer.get_lr():.1e} ', end='')
        print(f'| clip {args.clip} ', end='')
        print(f'| num_updates {trainer.n_updates} |')
        
        # saving model
        save_vars = {
            'epoch': epoch,
            'iteration': trainer.n_updates,
            'best_loss': train_loss if train_loss < best_loss else best_loss,
            'args': args, 
            'weights': model.state_dict()
        }

        if train_loss < best_loss:
            best_loss = train_loss
            filename = os.path.join(args.savedir, 'checkpoint_best.pt') 
            torch.save(save_vars, filename)
        if epoch % args.save_epoch == 0:
            filename = os.path.join(args.savedir, f'checkpoint_{epoch}.pt') 
            torch.save(save_vars, filename)
        filename = os.path.join(args.savedir, 'checkpoint_last.pt') 
        torch.save(save_vars, filename)

        # update
        epoch += 1
cfg.seed = cfg.seed if cfg.seed else random.randint(1, 10000)
print('Random seed: {}'.format(cfg.seed))
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
random.seed(cfg.seed)
result_json = pjoin(cfg.savepath,
                    'result.json') if cfg.resume_result_json else None

# DATA
dataset = AttributeDataLoader(mbsize=cfg.vae.batch_size,
                              max_seq_len=cfg.max_seq_len,
                              device=device,
                              attributes=cfg.attributes,
                              **cfg.data_kwargs)
dataset.print_stats()
utils.save_vocab(dataset.TEXT.vocab, cfg.vocab_path)

# MODEL
if cfg.model.pretrained_emb:
    cfg.model.pretrained_emb = dataset.get_vocab_vectors()

model = RNN_VAE(n_vocab=dataset.n_vocab,
                max_seq_len=cfg.max_seq_len,
                **cfg.model).to(device)
print(model)

if cfg.loadpath:
    model.load_state_dict(torch.load(cfg.loadpath))
    print('Loaded model from ' + cfg.loadpath)

# ---------------------------------------------#
##############################

vocab_rhyme, idx2syl_rhyme, syl2idx_rhyme = build_vocab_rhyme(divine_comedy)

dataset_rhyme = build_dataset_rhyme(divine_comedy,
                                    vocab_rhyme,
                                    idx2syl_rhyme,
                                    syl2idx_rhyme,
                                    seq_length=SEQ_LENGTH)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file_rhyme = os.path.join(logs_dir, 'vocab_rhyme.json')

save_vocab(vocab_rhyme, idx2syl_rhyme, syl2idx_rhyme, vocab_file_rhyme)

dataset_train_rhyme, dataset_val_rhyme = split_dataset(dataset_rhyme)

dataset_train_rhyme = dataset_train_rhyme.batch(BATCH_SIZE,
                                                drop_remainder=True)
dataset_val_rhyme = dataset_val_rhyme.batch(BATCH_SIZE, drop_remainder=True)

model_rhyme = build_model(
    name='RhymeNetwork',
    vocab_size=len(vocab_rhyme),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
Example #12
def main(args):
    device = torch.device('cuda' if args.gpu else 'cpu')

    if args.model:
        basedir, _ = os.path.split(args.model)
        path = os.path.join(basedir, 'text.field')
        TEXT = utils.load_field(path)
    else:
        TEXT = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')

    fields = [('src', TEXT), ('tgt', TEXT)] if args.mode else [('src', TEXT)]

    # load training data
    if args.mode == 'finetune':
        slen_filter = lambda x: args.src_minlen <= len(x.src) <= args.src_maxlen \
                             and args.tgt_minlen <= len(x.tgt) <= args.tgt_maxlen

        train_data = data.TabularDataset(
            path=args.train,
            format='tsv',
            fields=fields,
            filter_pred=slen_filter,
        )
    else:  # pretrain
        train_data = datasets.LanguageModelingDataset(path=args.train,
                                                      text_field=TEXT,
                                                      newline_eos=True)

    # set Vocabulary object
    if args.model is None:
        TEXT.build_vocab(
            train_data,
            min_freq=args.min_freq,
            specials=['<sep>', '<mask>'],
        )

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    utils.save_field(args.savedir, [('text', TEXT)])
    utils.save_vocab(args.savedir, [('text', TEXT)])

    # set training iterator
    if args.mode == 'finetune':
        train_iter = data.BucketIterator(
            train_data,
            batch_size=args.batch_size,
            sort_within_batch=True,
            sort_key=lambda x: len(x.src),
            repeat=False,
        )
    else:  # pre-train
        train_iter = data.BPTTIterator(
            train_data,
            batch_size=args.batch_size,
            bptt_len=args.bptt_len,
            train=True,
            repeat=False,
            shuffle=True,
        )

    print(f'| [text] Dictionary: {len(TEXT.vocab.itos)} types')
    print('')

    print(f' train: {args.train}')
    utils.get_stats(train_iter, fields)

    # load validation data
    if args.valid is not None:
        if args.mode == 'finetune':
            valid_data = data.TabularDataset(
                path=args.valid,
                format='tsv',
                fields=fields,
                filter_pred=slen_filter,
            )

            valid_iter = data.BucketIterator(valid_data,
                                             batch_size=args.batch_size,
                                             sort_within_batch=True,
                                             sort_key=lambda x: len(x.src),
                                             train=False,
                                             repeat=False,
                                             shuffle=False)
        else:  # pre-train
            valid_data = datasets.LanguageModelingDataset(path=args.valid,
                                                          text_field=TEXT,
                                                          newline_eos=True)

            valid_iter = data.BPTTIterator(
                valid_data,
                batch_size=args.batch_size,
                bptt_len=args.bptt_len,
                train=False,
                repeat=False,
                shuffle=False,
            )

        print(f'valid: {args.valid}')
        utils.get_stats(valid_iter, fields)

    # build a model
    if args.model:
        load_vars = torch.load(args.model)
        epoch = load_vars['epoch'] + 1
        best_loss = load_vars['best_loss']
        lm_args, lm_weights = load_vars['args'], load_vars['weights']
        model = TranslationLM(TEXT, lm_args)
        model.load_state_dict(lm_weights)
        model.to(device)
    else:
        epoch = 1
        best_loss = math.inf
        model = TranslationLM(TEXT, args).to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])

    optimizer_fn = utils.get_optimizer(args.optimizer)
    optimizer = optimizer_fn(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

    # show the details of model and optimizer
    print('=============== MODEL ===============')
    print(model)
    print('')
    print('=============== OPTIMIZER ===============')
    print(optimizer)
    print('')

    max_epoch = (args.max_epoch or math.inf) + epoch

    while epoch < max_epoch and args.min_lr < optimizer.param_groups[0]['lr']:
        # training
        model.train()
        loss = step(epoch, args.mode, model, train_iter, criterion, optimizer,
                    device)

        # validation
        if args.valid is not None:
            model.eval()
            loss = step(epoch, args.mode, model, valid_iter, criterion,
                        optimizer, device)

        # saving model
        save_vars = {
            'epoch': epoch,
            'best_loss': loss if loss < best_loss else best_loss,
            'args': args,
            'weights': model.state_dict()
        }

        if loss < best_loss:
            best_loss = loss
            filename = os.path.join(args.savedir, 'checkpoint_best.pt')
            torch.save(save_vars, filename)
        if epoch % args.save_epoch == 0:
            filename = os.path.join(args.savedir, f'checkpoint_{epoch}.pt')
            torch.save(save_vars, filename)
        filename = os.path.join(args.savedir, 'checkpoint_last.pt')
        torch.save(save_vars, filename)

        # update
        scheduler.step(best_loss)
        epoch += 1
dataset = build_dataset(divine_comedy,
                        vocab,
                        idx2syl,
                        syl2idx,
                        seq_length=SEQ_LENGTH)

print("Corpus length: {} syllables".format(len(text_in_syls(divine_comedy))))
print("Vocab size:", len(vocab))

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file = os.path.join(logs_dir, 'vocab.json')

save_vocab(vocab, idx2syl, syl2idx, vocab_file)

dataset_train, dataset_val = split_dataset(dataset)

dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)
dataset_val = dataset_val.batch(BATCH_SIZE, drop_remainder=True)

model = build_model(
    vocab_size=len(vocab),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
elif args.model_architecture == 'fasttext':
    model = FT_gensim(size=args.size,
                      window=args.window,
                      min_count=args.min_count,
                      workers=args.threads,
                      sg=args.sg,
                      hs=args.hs,
                      negative=args.ns)

    # build the vocabulary
    model.build_vocab(sentences)

    # train the model
    model.train(sentences,
                epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)

elapsed = time.time()
logging.info(f'Training finished. Took {elapsed-start} s')
logging.info(f'Vocab size: {len(model.wv.vocab)}')
# Save model to disk
if args.format == 'gensim':
    model.wv.save(f'{MODELS_FOLDER / args.model_path}', separately=['vectors'])
elif args.format == 'w2v':
    model.wv.save_word2vec_format(f'{MODELS_FOLDER / args.model_path}.txt',
                                  binary=True)

# Save vocab to disk
save_vocab(model, args.vocab_path)
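save_vocab itself is not shown here. One plausible implementation for the pre-4.0 gensim API used above (the tab-separated file format is an assumption) writes each vocabulary word together with its corpus count:

def save_vocab(model, vocab_path):
    # model.wv.vocab maps word -> gensim Vocab entry (gensim < 4.0), matching
    # the len(model.wv.vocab) call in the logging above.
    with open(vocab_path, "w", encoding="utf-8") as f:
        for word, entry in model.wv.vocab.items():
            f.write(f"{word}\t{entry.count}\n")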
Example #15
def train():
    args = hparams()

    print("Load data...")
    x_text = utils.load_data(args.text_file)
    vocab_dict = utils.build_vocab(x_text)
    x_data = utils.transform(x_text, vocab_dict)

    x_data = x_data[:-2]
    y_data = x_data[1:]

    # Split train/test set
    dev_sample_index = -1 * int(
        args.dev_sample_percentage * float(len(x_data)))
    x_train, x_dev = x_data[:dev_sample_index], x_data[dev_sample_index:]
    y_train, y_dev = y_data[:dev_sample_index], y_data[dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}".format(len(x_train), len(x_dev)))

    utils.save_vocab(vocab_dict, args.vocab_file)
    del x_text, x_data, y_data

    # Training
    sess = tf.Session()
    with sess.as_default():
        rnn = RNNLM(vocab_size=args.vocab_size,
                    embedding_dim=args.embedding_dim,
                    rnn_size=args.rnn_size,
                    num_layers=args.num_layers,
                    batch_size=args.batch_size,
                    training=True)

        # Define train_op
        global_step = tf.Variable(0, name="global_step", trainable=False)
        learning_rate = tf.Variable(args.learning_rate,
                                    name="learning_rate",
                                    trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(rnn.loss, tvars),
                                          args.max_grad_norm)
        train_op = optimizer.apply_gradients(zip(grads, tvars),
                                             global_step=global_step)

        # Save model params
        checkpoint_dir = os.path.abspath(
            os.path.join(os.path.curdir, "checkpoints"))
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_file = os.path.join(checkpoint_dir, "model")

        # Save best model params
        dev_dir = os.path.abspath(os.path.join(os.path.curdir, "dev"))
        if not os.path.exists(dev_dir):
            os.makedirs(dev_dir)
        dev_file = os.path.join(dev_dir, "model")
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)

        dev_loss = 2e+50

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # Training loop...
        for epoch in range(args.num_epochs):
            # Generate batches
            x_batches = utils.batch_iter(x_train, args.sequence_length,
                                         args.batch_size)
            y_batches = utils.batch_iter(y_train, args.sequence_length,
                                         args.batch_size)
            initial_state = sess.run(rnn.initial_state)
            for x_batch, y_batch in zip(x_batches, y_batches):
                feed_dict = {
                    rnn.input_data: x_batch,
                    rnn.targets: y_batch,
                    rnn.input_keep_prob: args.input_keep_prob,
                    rnn.output_keep_prob: args.output_keep_prob,
                    rnn.initial_state: initial_state
                }
                _, step, loss = sess.run([train_op, global_step, rnn.loss],
                                         feed_dict=feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))

                # Evaluate on dev set
                current_step = tf.train.global_step(sess, global_step)
                if current_step % args.checkpoint_steps == 0:
                    saver.save(sess, checkpoint_file, global_step=current_step)
                    print("Save model to %s" % checkpoint_file)
                if current_step % args.evaluate_steps == 0:
                    x_dev_batches = utils.batch_iter(x_dev,
                                                     args.sequence_length,
                                                     args.batch_size)
                    y_dev_batches = utils.batch_iter(y_dev,
                                                     args.sequence_length,
                                                     args.batch_size)
                    dev_losses = 0.0
                    i = 0
                    for x_dev_batch, y_dev_batch in zip(
                            x_dev_batches, y_dev_batches):
                        dev_feed_dict = {
                            rnn.input_data: x_dev_batch,
                            rnn.targets: y_dev_batch,
                            rnn.input_keep_prob: 1.0,
                            rnn.output_keep_prob: 1.0,
                            rnn.initial_state: initial_state
                        }
                        step, loss = sess.run([global_step, rnn.loss],
                                              feed_dict=dev_feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        dev_losses += loss
                        i += 1
                    loss = dev_losses / i
                    print("Evaluate on dev set:")
                    print("{}: step {}, loss {:g}".format(
                        time_str, step, loss))

                    if dev_loss > loss:
                        dev_loss = loss
                        saver.save(sess, dev_file, global_step=current_step)
                        print("Save better model to %s" % dev_file)
##############################

vocab_verse, idx2syl_verse, syl2idx_verse = build_vocab_verse(divine_comedy)

dataset_verse = build_dataset_verse(divine_comedy,
                                    vocab_verse,
                                    idx2syl_verse,
                                    syl2idx_verse,
                                    seq_length=SEQ_LENGTH)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file_verse = os.path.join(logs_dir, 'vocab_verse.json')

save_vocab(vocab_verse, idx2syl_verse, syl2idx_verse, vocab_file_verse)

dataset_train_verse, dataset_val_verse = split_dataset(dataset_verse)

dataset_train_verse = dataset_train_verse.batch(BATCH_SIZE,
                                                drop_remainder=True)
dataset_val_verse = dataset_val_verse.batch(BATCH_SIZE, drop_remainder=True)

model_verse = build_model(
    name='VerseNetwork',
    vocab_size=len(vocab_verse),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
Example #17
dataset = build_dataset(divine_comedy,
                        vocab,
                        idx2char,
                        char2idx,
                        seq_length=SEQ_LENGTH)

print("Corpus length: {} characters".format(len(divine_comedy)))
print("Vocab size:", len(vocab))

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file = os.path.join(logs_dir, 'vocab.json')

save_vocab(vocab, idx2char, char2idx, vocab_file)

dataset_train, dataset_val = split_dataset(dataset)

dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)
dataset_val = dataset_val.batch(BATCH_SIZE, drop_remainder=True)

model = build_model(
    vocab_size=len(vocab),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
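The save_vocab(vocab, idx2char, char2idx, vocab_file) helper shared by these character/syllable examples is not shown; a minimal sketch, assuming it simply serialises the three mappings to the given JSON path (the key names are assumptions):

import json

def save_vocab(vocab, idx2sym, sym2idx, vocab_file):
    payload = {
        "vocab": list(vocab),
        "idx2sym": list(idx2sym) if not isinstance(idx2sym, dict) else idx2sym,
        "sym2idx": sym2idx,
    }
    with open(vocab_file, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)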
# EPOCHS = 200
# SEQ_LENGTH = 75
# EMBEDDING_DIM = 256
# RNN_UNITS = 1024
# RNN_TYPE = 'gru'

##############################

vocab, idx2word, word2idx = build_vocab(divine_comedy)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file = os.path.join(logs_dir, 'vocab.json')

save_vocab(vocab, idx2word, word2idx, vocab_file)

dataset = build_dataset(divine_comedy,
                        vocab,
                        idx2word,
                        word2idx,
                        seq_length=SEQ_LENGTH)

print("Corpus length: {} words".format(len(divine_comedy)))
print("Vocab size:", len(vocab))

dataset_train, dataset_val = split_dataset(dataset)

#for s in dataset_train.take(1).as_numpy_iterator():
#    print(s)