Example #1
def main(args):
    datamodule = LanguageDataModule(root=args.dataset_path,
                                    languages=args.languages,
                                    batch_size=args.batch_size,
                                    num_workers=args.num_workers)

    model = LanguageModel(
        # layers=14,       # 10
        #  blocks=1,       # 4
        skip_channels=32,  # 256
        end_channels=32,   # 256
        # uncomment for fast debug network
    )

    ckpt = torch.load(args.ckpt_path)
    model.load_state_dict(ckpt['state_dict'])

    trainer = pl.Trainer(

        # comment to run on cpu for local testing
        gpus=args.gpus,
        auto_select_gpus=True,
        # distributed_backend='ddp',
        benchmark=True,
        ## -------
        terminate_on_nan=True,
    )
    datamodule.setup()

    # trainer.fit(model, datamodule)

    results = trainer.test(model, datamodule.test_dataloader())
Example #2
def rnn_main(dataset):
    model = LanguageModel(dataset.vocab).to(_flags.device())

    def sample():
        return dataset.sample_train(aug_ratio=FLAGS.aug_ratio)

    def score_utts(utts):
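        # Treat each utterance as a (no-context, utterance) pair, batch them, and
        # rescale the model's mean score back into a total over all predicted tokens.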
        fake = [((), utt) for utt in utts]
        batch = make_batch(fake, model.vocab, staged=False)
        mean = model(None, batch.out_data, None, None).item()
        tot = mean * sum(len(utt) - 1 for utt in utts)
        return tot

    def callback(i_epoch):
        model.eval()
        final = i_epoch == FLAGS.n_epochs - 1
        with hlog.task("eval_val", timer=False):
            val_acc = evaluate(score_utts, dataset.get_val(), dataset)
        if FLAGS.TEST and (final or FLAGS.test_curve):
            with hlog.task("eval_test", timer=False):
                evaluate(score_utts, dataset.get_test(), dataset)
        if (i_epoch + 1) % FLAGS.n_checkpoint == 0:
            torch.save(
                model.state_dict(),
                os.path.join(FLAGS.model_dir, "model.%05d.chk" % i_epoch))
        return val_acc

    train(dataset, model, sample, callback, staged=False)
Example #3
def run(arguments) -> None:
    hyperparameters = LanguageModel.get_default_hyperparameters()
    hyperparameters["run_id"] = make_run_id(arguments)
    max_epochs = int(arguments.get("--max-num-epochs"))
    patience = int(arguments.get("--patience"))
    max_num_files = arguments.get("--max-num-files")

    # override hyperparams if flag is passed
    hypers_override = arguments.get("--hypers-override")
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    save_model_dir = arguments["SAVE_DIR"]
    os.makedirs(save_model_dir, exist_ok=True)
    save_file = os.path.join(save_model_dir,
                             f"{hyperparameters['run_id']}_best_model.bin")

    print("Loading data ...")
    vocab = build_vocab_from_data_dir(
        data_dir=arguments["TRAIN_DATA_DIR"],
        vocab_size=hyperparameters["max_vocab_size"],
        max_num_files=max_num_files,
    )
    print(f"  Built vocabulary of {len(vocab)} entries.")
    train_data = load_data_from_dir(
        vocab,
        length=hyperparameters["max_seq_length"],
        data_dir=arguments["TRAIN_DATA_DIR"],
        max_num_files=max_num_files,
    )
    print(
        f"  Loaded {train_data.shape[0]} training samples from {arguments['TRAIN_DATA_DIR']}."
    )
    valid_data = load_data_from_dir(
        vocab,
        length=hyperparameters["max_seq_length"],
        data_dir=arguments["VALID_DATA_DIR"],
        max_num_files=max_num_files,
    )
    print(
        f"  Loaded {valid_data.shape[0]} validation samples from {arguments['VALID_DATA_DIR']}."
    )
    model = LanguageModel(hyperparameters, vocab)
    model.build([None, hyperparameters["max_seq_length"]])
    print(
        f"Constructed model, using the following hyperparameters: {json.dumps(hyperparameters)}"
    )

    train(
        model,
        train_data,
        valid_data,
        batch_size=hyperparameters["batch_size"],
        max_epochs=max_epochs,
        patience=patience,
        save_file=save_file,
    )
Example #4
    def generate_program_desc(self, do_test=False):
        """
        generate the paddle program desc
        """
        with fluid.program_guard(self.main_program_, self.startup_program_):
            self.input_model_ = LanguageModel()
            model_configs = self.trainer_config
            self.input_model_.build_model(model_configs)

            optimizer = fluid.optimizer.SGD(
                learning_rate=self.trainer_config["lr"],
                grad_clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=self.trainer_config["max_grad_norm"]))
            optimizer.minimize(self.input_model_.get_model_loss())

        self.main_program_desc_ = self.main_program_.desc.serialize_to_string()
        self.startup_program_desc_ = self.startup_program_.desc.serialize_to_string(
        )
        self.update_trainer_configs("loss_name",
                                    self.input_model_.get_model_loss_name())
        self.update_trainer_configs(
            "input_names",
            self.input_model_.get_model_input_names(),
        )
        self.update_trainer_configs(
            "target_names",
            self.input_model_.get_target_names(),
        )
        self.update_trainer_configs(
            "metrics",
            self.input_model_.get_model_metrics(),
        )
        self.update_trainer_configs("show_metric", True)
        self.update_trainer_configs("max_training_steps", "inf")
        self.update_trainer_configs("shuffle", False)
        self.update_trainer_configs("main_program_desc",
                                    self.main_program_desc_)
        self.update_trainer_configs("startup_program_desc",
                                    self.startup_program_desc_)

        if do_test:
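            # For testing, prune the training program down to an inference-only
            # program over the model inputs/targets, attach feed/fetch ops, and
            # store its serialized desc in the trainer config as well.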
            input_names = self.input_model_.get_model_input_names()
            target_var_names = self.input_model_.get_target_names()
            self.infer_program_ = self.main_program_._prune_with_input(
                feeded_var_names=input_names, targets=target_var_names)
            self.infer_program_ = self.infer_program_._inference_optimize(
                prune_read_op=True)
            fluid.io.prepend_feed_ops(self.infer_program_, input_names)
            fluid.io.append_fetch_ops(self.infer_program_, target_var_names)
            self.infer_program_.desc._set_version()
            fluid.core.save_op_compatible_info(self.infer_program_.desc)
            self.infer_program_desc_ = self.infer_program_.desc.serialize_to_string(
            )
            self.update_trainer_configs("infer_program_desc",
                                        self.infer_program_desc_)
Example #5
def main(_):
    # Load configuration.
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # Initialize CoNLL dataset.
    dataset = CoNLLDataset(fname=config['data']['train'], target='lm')

    # Initialize model.
    language_model = LanguageModel(
        vocab_size=len(dataset.token_vocab),
        embedding_dim=config['model']['embedding_dim'],
        hidden_size=config['model']['hidden_size'],
        num_layers=config['model']['num_layers'])
    if torch.cuda.is_available():
        language_model = language_model.cuda()

    # Initialize loss function. NOTE: Manually setting weight of padding to 0.
    weight = torch.ones(len(dataset.token_vocab))
    weight[0] = 0
    if torch.cuda.is_available():
        weight = weight.cuda()
    loss_function = torch.nn.NLLLoss(weight)
    optimizer = torch.optim.Adam(language_model.parameters())

    # Main training loop.
    data_loader = DataLoader(dataset,
                             batch_size=config['training']['batch_size'],
                             shuffle=True,
                             collate_fn=collate_annotations)
    losses = []
    i = 0
    for epoch in range(config['training']['num_epochs']):
        for batch in data_loader:
            inputs, targets, lengths = batch
            optimizer.zero_grad()
            outputs, _ = language_model(inputs, lengths=lengths)

            outputs = outputs.view(-1, len(dataset.token_vocab))
            targets = targets.view(-1)
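            # outputs is now (batch * seq_len, vocab) and targets (batch * seq_len,),
            # matching what the token-level NLL loss below expects.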

            loss = loss_function(outputs, targets)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            if (i % 100) == 0:
                average_loss = np.mean(losses)
                losses = []
                print('Iteration %i - Loss: %0.6f' % (i, average_loss),
                      end='\r')
            if (i % 1000) == 0:
                torch.save(language_model, config['data']['checkpoint'])
            i += 1
    torch.save(language_model, config['data']['checkpoint'])
Example #6
def graph(params):

    model = LanguageModel(
        params.vocab.size(),
        params.embed_size,
        params.hidden_size,
        params.nlayers,
        dropout=params.dropout,
        cell=params.cell,
    )

    loss = torch.nn.CrossEntropyLoss()

    return model, loss
Example #7
def model_load(path, model=None, optimizer=None):
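    """Rebuild a LanguageModel from the config stored in `path` (or re-initialize
    the given model), load its checkpointed weights onto the CPU, and optionally
    restore the optimizer state."""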
    config = LMConfig(os.path.join(path, 'config.json'))
    if model is None:
        model_to_load = LanguageModel(config)
    else:
        model_to_load = get_model(model)
        model_to_load.__init__(config)
    model_state_dict = torch.load(open(os.path.join(path, 'model.pt'), 'rb'),
                                  map_location=lambda s, l: s)
    model_to_load.load_state_dict(model_state_dict)
    if optimizer:
        optimizer_state_dict = torch.load(open(
            os.path.join(path, 'optimizer.pt'), 'rb'),
                                          map_location=lambda s, l: s)
        optimizer.load_state_dict(optimizer_state_dict)
    return model_to_load
Example #8
def freestyle(loc):  # TODO

    # load data
    model_dir = Path(loc)
    settings = pickle.load(open(model_dir / 'settings.pkl', 'rb'))
    print(settings)

    # settings
    cell = settings['cell']
    hidden_size = settings['hidden_size']
    token = settings['token']
    small = settings['small']
    how_many = 100

    # load the models
    vocab = generate.get_vocab(token, small)
    if token == 'word':
        emb = generate.get_embedding('word2vec')
        input_size = emb.vectors.shape[1]
        output_size = emb.vectors.shape[0]
    elif token == 'character':
        emb = None
        input_size = vocab.size
        output_size = vocab.size
    fnames = sorted(os.listdir(model_dir / 'checkpoints'))
    fname = fnames[-1]  # latest checkpoint (sorted, since os.listdir order is arbitrary)

    # load the model
    model = LanguageModel(cell, input_size, hidden_size, output_size)
    model.load_state_dict(torch.load(model_dir / 'checkpoints' / fname))
    model.eval()

    # monitor
    sents = [
        'The Standard ', 'non-abelian', 'silicon pixel detector',
        'estimate the', '[23] ATLAS'
    ]
    temperatures = [0.01 + 0.1 * i for i in range(11)]
    eval_stream = model_dir / 'evaluate_stream.txt'

    for temperature in temperatures:
        txt = '\nTemperature = {}'.format(temperature)
        utils.report(txt, eval_stream)
        for sent in sents:
            txt = generate.compose(model, vocab, emb, sent, temperature,
                                   how_many)
            utils.report(txt, eval_stream)
Example #9
def build_LM(in_file: str) -> LanguageModel:
    """
    Build language models for each label.
    Each line in in_file contains a label and a string, separated by a space.
    """
    print('building language models...')

    lm = LanguageModel()

    with open(in_file, encoding="utf8") as in_file_lines:
        for line in in_file_lines:
            (language, l) = line.split(" ", 1)
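            # tokenize() receives NGRAM_SIZE - 1 here, presumably to pad the line
            # so that every position yields a full n-gram of size NGRAM_SIZE.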
            for gram in create_grams(tokenize(l, NGRAM_SIZE - 1), NGRAM_SIZE):
                lm.add_gram(gram, language)

    return lm
Example #10
def main(args):
    logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    spm_path = os.path.join('spm', args.spm, "spm.model")
    args.sample = parse_sample_options(args.sample)
    logger.info(f"Loading tokenizer from {spm_path}")
    tokenizer = Tokenizer(spm_path)
    args.ntoken = ntoken = len(tokenizer)
    logger.info(f"  Vocabulary size: {ntoken}")

    logger.info("Reading dataset")
    data = {}
    for x in ['train', 'valid', 'test']:
        data[x] = read_data(os.path.join(args.data_dir, f"{x}.query.txt"),
                            min_len=args.min_len)
        logger.info(f"  Number of {x:>5s} data: {len(data[x]):8d}")

    logger.info("Preparing model and optimizer")
    config = LMConfig(ntoken, args.ninp, args.nhid, args.nlayers,
                      args.dropouti, args.dropoutr, args.dropouth,
                      args.dropouto)
    model = LanguageModel(config).to(device)
    params = get_params(model)
    logger.info(
        f"  Number of model parameters: {sum(p.numel() for p in params)}")
    optimizer = torch.optim.Adam(params)

    if args.resume:
        logger.info(f"Loading model from {args.resume}")
        model_load(args.resume, model, optimizer)
        model = model.to(device)

    if n_gpu > 1:
        logger.info("Making model data parallel")
        model = torch.nn.DataParallel(model, dim=1)

    train(model, optimizer, tokenizer, data['train'], data['valid'], args)

    test(model, tokenizer, data['test'], args)
Example #11
def plot_losses(loc):

    # load data
    model_dir = Path(loc)
    settings = pickle.load(open(model_dir / 'settings.pkl', 'rb'))

    # settings
    cell = settings['cell']
    hidden_size = settings['hidden_size']
    token = settings['token']
    small = settings['small']
    max_len = settings['max_len']
    n_epochs = settings['n_epochs']
    n_saves = settings['n_saves']
    criterion = nn.CrossEntropyLoss()

    # load the models
    models = []
    vocab = generate.get_vocab(token, small)
    if token == 'word':
        emb = generate.get_embedding('word2vec')
        input_size = emb.vectors.shape[1]
        output_size = emb.vectors.shape[0]
    elif token == 'character':
        emb = None
        input_size = vocab.size
        output_size = vocab.size

    for fname in os.listdir(model_dir / 'checkpoints'):
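        # one fresh model instance per saved checkpoint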
        model = LanguageModel(cell, input_size, hidden_size, output_size)
        model.load_state_dict(torch.load(model_dir / 'checkpoints' / fname))
        model.eval()
        models.append(model)

    # prepare training and validation sets
    N = 10000
    splits = ['train', 'valid']
    gens = {
        split: generate.generate(split,
                                 token=token,
                                 max_len=max_len,
                                 small=small,
                                 batch_size=N)
        for split in splits
    }
    batch, labels = {}, {}
    for split in splits:
        for b, l in gens[split]:
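            # only the first generated batch of N examples is kept per split
            # (note the break below)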

            # one hot encode
            if token == 'character':
                b = generate.one_hot_encode(b, vocab)
            # or embed
            elif token == 'word':
                b = generate.w2v_encode(b, emb, vocab)

            batch[split] = torch.Tensor(b)
            labels[split] = torch.Tensor(l).long()
            break

    # evaluate the models
    loss = {split: [] for split in splits}
    acc = {split: [] for split in splits}
    for i, model in enumerate(models):
        t0 = time.time()
        print(i)
        for split in splits:
            # loss
            outputs = model(batch[split])
            l = criterion(outputs, labels[split])
            loss[split].append(float(l))
            # accuracy
            _, preds = torch.max(outputs, 1)
            a = sum(preds == labels[split]) / float(N)
            acc[split].append(float(a))
        print('{:2.2f}s'.format(time.time() - t0))

    for split in splits:
        with open(model_dir / 'best_{}_acc.txt'.format(split), 'w') as handle:
            best = max(acc[split])
            handle.write('{}\n'.format(best))

    # plot both quantities
    for quantity, description in zip([loss, acc], ['Loss', 'Accuracy']):
        fig, ax = plt.subplots()
        for split in splits:
            xs = (1 + np.arange(len(quantity[split]))) / n_saves
            ax.plot(xs, quantity[split], label=split)
        ax.set_xlabel('Training epoch')
        if n_epochs > 1:
            ax.set_xlabel('Epoch')
        ax.set_ylabel(description)
        upper = ax.get_ylim()[1] if description == 'Loss' else 1
        ax.set_ylim(0, upper)
        ax.set_xlim(0, ax.get_xlim()[1])
        ax.set_title(model_dir.name, fontsize=7)
        ax.legend()
        ax.grid(alpha=0.5, which='both')
        plt.savefig(model_dir / '{}.pdf'.format(description))
Example #12
    test_input, test_label = get_batch(test_file_path,
                                       word_dict,
                                       batch_size=args.batch_size,
                                       bptt=args.bptt)
    with open(test_pkl_path, 'wb') as f:
        pickle.dump({'data': test_input, 'label': test_label}, f)

with open(train_pkl_path, 'rb') as f:
    train_data = pickle.load(f)
with open(test_pkl_path, 'rb') as f:
    test_data = pickle.load(f)

model = LanguageModel(dict_size,
                      args.hidden_size,
                      args.hidden_size,
                      n_layer=1,
                      drop_rate=args.drop_rate,
                      adaptive_softmax=with_adaptive,
                      cutoff=cutoff_list)
# model = model.cuda()  # uncomment to move the model to the GPU
optimizer = optim.Adagrad(model.parameters(),
                          lr=args.learning_rate,
                          lr_decay=args.learning_rate_decay,
                          weight_decay=args.weight_decay)

if with_adaptive:
    print('Use adaptive softmax.')
    criterion = AdaptiveLoss(cutoff_list)
else:
    print('Use common softmax.')
    criterion = nn.CrossEntropyLoss()
Example #13

input = data['input']
label = data['label']

vocab = len(data['worddic'])

if args.model == 'adasoft':
    adasoft = True

elif args.model == 'linear':
    adasoft = False

else:
    raise ValueError('unexpected --model value: %s' % args.model)

model = LanguageModel(vocab,
                      512,
                      512,
                      1,
                      adaptive_softmax=adasoft,
                      cutoff=[2000, 10000])
model.cuda()
optimizer = optim.Adagrad(model.parameters(),
                          lr=0.1,
                          lr_decay=1e-5,
                          weight_decay=1e-5)

if adasoft:
    criterion = AdaptiveLoss([2000, 10000, vocab + 1])

else:
    criterion = nn.CrossEntropyLoss()

Example #14
def train(opt):

    # Read preprocessed data
    print_line()
    print('Loading training data ...')
    check_name = re.compile(r'.*\.prep\.train\.pt')
    assert os.path.exists(
        opt.train_data) or check_name.match(opt.train_data) is None
    train_dataset = torch.load(opt.train_data)
    train_dataset.set_batch_size(opt.batch_size)
    print('Done.')

    print_line()
    print('Loading validation data ...')
    check_name = re.compile(r'.*\.prep\.val\.pt')
    assert os.path.exists(
        opt.val_data) or check_name.match(opt.val_data) is None
    val_dataset = torch.load(opt.val_data)
    val_dataset.set_batch_size(opt.batch_size)
    print('Done.')

    # Build / load  Model
    if opt.model_reload is None:
        print_line()
        print('Build new model...')

        model = LanguageModel(train_dataset.num_vocb,
                              dim_word=opt.dim_word,
                              dim_rnn=opt.dim_rnn,
                              num_layers=opt.num_layers,
                              dropout_rate=opt.dropout_rate)

        model.dictionary = train_dataset.dictionary
        print('Done')
        train_dataset.describe_dataset()
        val_dataset.describe_dataset()

    else:
        print_line()
        print('Loading existing model...')
        model = torch.load(opt.model_reload)
        print('done')
        train_dataset.change_dict(model.dictionary)
        val_dataset.change_dict(model.dictionary)

    model_start_epoch = model.train_info['epoch idx'] - 1
    model_start_batch = model.train_info['batch idx'] - 1

    # Use GPU / CPU
    print_line()
    if opt.cuda:
        model.cuda()
        print('Using GPU %d' % torch.cuda.current_device())
    else:
        print('Using CPU')

    # Criterion: mask the padding token
    criterion_weight = torch.ones(train_dataset.num_vocb + 1)
    criterion_weight[const.PAD] = 0
    criterion = nn.CrossEntropyLoss(weight=criterion_weight,
                                    size_average=False)
    if opt.cuda:
        criterion = criterion.cuda()

    # Optimizer
    lr = opt.lr
    optimizer = getattr(optim, opt.optimizer)(model.parameters(), lr=lr)

    if (model_start_epoch > opt.epoch):
        print(
            'This model has already been trained for more than %d epochs; '
            'increase the epoch parameter if you want to continue'
            % (opt.epoch + 1))
        return

    print_line()
    print('')
    if opt.model_reload is None:
        print('Start training new model, will go through %d epoch' % opt.epoch)
    else:
        print('Continue existing model, from epoch %d, batch %d to epoch %d' %
              (model_start_epoch, model_start_batch, opt.epoch))
    print('')

    best_model = model.train_info

    if opt.save_freq == 0:
        opt.save_freq = train_dataset.num_batch - 1

    # Train
    model.train()

    for epoch_idx in range(model_start_epoch, opt.epoch):
        # New epoch
        acc_loss = 0
        acc_count = 0
        start_time = time.time()
        train_dataset.shuffle()

        print_line()
        print('Start epoch %d, learning rate %f ' % (epoch_idx + 1, lr))
        print_line('-')
        epoch_start_time = start_time

        # If load model and continue training
        if epoch_idx == model_start_epoch and model_start_batch > 0:
            start_batch = model_start_batch
        else:
            start_batch = 0

        for batch_idx in range(start_batch, train_dataset.num_batch):
            # Generate batch data
            batch_data, batch_lengths, target_words = train_dataset[batch_idx]

            if opt.cuda:
                batch_data = batch_data.cuda()
                batch_lengths = batch_lengths.cuda()
                target_words = target_words.cuda()

            batch_data = Variable(batch_data, requires_grad=False)
            batch_lengths = Variable(batch_lengths, requires_grad=False)
            target_words = Variable(target_words, requires_grad=False)

            optimizer.zero_grad()

            # Forward
            output_flat = model.forward(batch_data, batch_lengths)

            # Calculate loss
            loss = criterion(output_flat, target_words.view(-1))

            # Backward
            loss.backward()

            # Prevent gradient explode
            torch.nn.utils.clip_grad_norm(model.parameters(), opt.clip)

            # Update parameters
            optimizer.step()

            # Accumulate loss
            acc_loss += loss.data
            acc_count += batch_lengths.data.sum()

            # Display progress
            if batch_idx % opt.display_freq == 0:
                average_loss = acc_loss[0] / acc_count.item()
                print(
                    'Epoch : %d, Batch : %d / %d, Loss : %f, Perplexity : %f, Time : %f'
                    % (epoch_idx + 1, batch_idx,
                       train_dataset.num_batch, average_loss,
                       math.exp(average_loss), time.time() - start_time))

                acc_loss = 0
                acc_count = 0
                start_time = time.time()

            # Save and validate if necessary
            if (1 + batch_idx) % opt.save_freq == 0:

                print_line('-')
                print('Pause training for save and validate.')

                model.eval()
                val_loss = evaluate(model=model,
                                    eval_dataset=val_dataset,
                                    cuda=opt.cuda,
                                    criterion=criterion)
                model.train()

                print('Validation Loss : %f' % val_loss)
                print('Validation Perplexity : %f' % math.exp(val_loss))

                model_savename = opt.model_name + '-e_' + str(
                    epoch_idx +
                    1) + '-b_' + str(batch_idx + 1) + '-ppl_' + str(
                        int(math.exp(val_loss))) + '.pt'

                model.val_loss = val_loss
                model.val_ppl = math.exp(val_loss)
                model.epoch_idx = epoch_idx + 1
                model.batch_idx = batch_idx + 1

                model.train_info['val loss'] = val_loss
                model.train_info['train loss'] = math.exp(val_loss)
                model.train_info['epoch idx'] = epoch_idx + 1
                model.train_info['batch idx'] = batch_idx + 1
                model.train_info['val ppl'] = math.exp(model.val_loss)
                model.train_info['save name'] = model_savename

                try:
                    torch.save(model, model_savename)
                except:
                    print('Failed to save model!')

                if model.val_loss < best_model['val loss']:
                    print_line('-')
                    print('New best model on validation set')
                    best_model = model.train_info
                    shutil.copy2(best_model['save name'],
                                 opt.model_name + '.best.pt')

                print_line('-')
                print('Save model at %s' % (model_savename))
                print_line('-')
                print('Continue Training...')

        print_line('-')
        print('Epoch %d finished, spend %d s' %
              (epoch_idx + 1, time.time() - epoch_start_time))

        # Update lr if needed
        lr *= opt.lr_decay
        optimizer = getattr(optim, opt.optimizer)(model.parameters(), lr=lr)

    # Finish training
    print_line()
    print(' ')
    print('Finish training %d epochs!' % opt.epoch)
    print(' ')
    print_line()
    print('Best model:')
    print('Epoch : %d, Batch : %d ,Loss : %f, Perplexity : %f' %
          (best_model['epoch idx'], best_model['batch idx'],
           best_model['val loss'], best_model['val ppl']))
    print_line('-')

    print('Save best model at %s' % (opt.model_name + '.best.pt'))
    shutil.copy2(best_model['save name'], opt.model_name + '.best.pt')
    print_line()
Example #15

def detach_hidden(h):
    """Detach hidden states from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(detach_hidden(v) for v in h)
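
# A minimal truncated-BPTT usage sketch for detach_hidden (hypothetical
# model/loader names): detaching between windows stops gradients from flowing
# back through the entire history.
#
#   hidden = model.init_hidden(batch_size)
#   for inputs, targets in loader:
#       hidden = detach_hidden(hidden)
#       outputs, hidden = model(inputs, hidden)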


torch.backends.cudnn.benchmark = True
torch.manual_seed(0)
np.random.seed(0)

labels = Labels()

model = LanguageModel(128, 512, 256, len(labels), n_layers=3, dropout=0.3)
model.cuda()

bptt = 8
batch_size = 64

root = '/open-stt-e2e/data/'

train = [
    root + 'asr_public_phone_calls_1.csv',
    root + 'asr_public_phone_calls_2_aa.csv',
    root + 'asr_public_phone_calls_2_ab.csv',
    root + 'public_youtube1120_aa.csv', root + 'public_youtube1120_ab.csv',
    root + 'public_youtube1120_ac.csv', root + 'public_youtube1120_hq.csv',
    root + 'public_youtube700_aa.csv', root + 'public_youtube700_ab.csv'
]
Example #16
    trainSet, vocab = creatDataSet('./data', 'ptb.train.txt')
    testSet, _ = creatDataSet('./data', 'ptb.test.txt')
    validSet, _ = creatDataSet('./data', 'ptb.valid.txt')

    word2idx, idx2word = word2index(vocab)

    ### Parameters Set ##########
    VOCAB_SIZE = len(word2idx)
    EMBEDDING_SIZE = 128
    HIDDEN_SIZE = 1024
    N_LAYERS = 1
    DOPROUT_P = 0.5
    BATCH_SIZE = 20
    SEQ_LENGTH = 30
    EPOCH = 40
    LEARNING_RATE = 0.01
    #############################

    train_data = batchify(prepare_sequence(trainSet, word2idx), BATCH_SIZE)
    test_data = batchify(prepare_sequence(testSet, word2idx), BATCH_SIZE)
    valid_data = batchify(prepare_sequence(validSet, word2idx), BATCH_SIZE)

    model = LanguageModel(VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, N_LAYERS,
                          DOPROUT_P).to(device)
    model.weight_init()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    trainModel(model, train_data, valid_data, BATCH_SIZE, SEQ_LENGTH, EPOCH)
    testModel(model, test_data, BATCH_SIZE, SEQ_LENGTH)
Example #17
                    lr=0.0001,
                    teacher_forcing_ratio=1.0,
                    seed=1,
                    max_len=428,
                    worker_num=1)

    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    cuda = config.use_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')

    model = LanguageModel(n_class=len(char2id),
                          n_layers=config.n_layers,
                          rnn_cell='lstm',
                          hidden_size=config.hidden_size,
                          dropout_p=config.dropout_p,
                          max_length=config.max_len,
                          sos_id=SOS_token,
                          eos_id=EOS_token,
                          device=device)
    model.flatten_parameters()
    model = nn.DataParallel(model).to(device)

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Prepare loss
    weight = torch.ones(len(char2id)).to(device)
    perplexity = Perplexity(weight, PAD_token, device)
    optimizer = optim.Adam(model.module.parameters(), lr=config.lr)
Example #18
from model import LanguageModel
import argparse

new = LanguageModel()

parser = argparse.ArgumentParser()

parser.add_argument("--length", default=100, type=int, help="text length")
parser.add_argument("--file",
                    default="bred.txt",
                    type=str,
                    help="file in which the text will be written")
parser.add_argument("--print",
                    default=False,
                    type=bool,
                    help="display text on screen or not")

args = parser.parse_args()

result = new.generate(args.length, args.file)

if args.print and result == 'Successfully':
    with open(args.file, 'r') as file:
        print(file.read())
else:
    print(result)
Example #19
def plot_switch_prob(loc):

    # load settings
    model_dir = Path(loc)
    settings = pickle.load(open(model_dir / 'settings.pkl', 'rb'))
    cell = settings['cell']
    hidden_size = settings['hidden_size']
    token = settings['token']
    small = settings['small']
    max_len = settings['max_len']

    # load the final model
    vocab = generate.get_vocab(token, small)
    if token == 'word':
        emb = generate.get_embedding('word2vec')
        input_size = emb.vectors.shape[1]
        output_size = emb.vectors.shape[0]
    elif token == 'character':
        emb = None
        input_size = vocab.size
        output_size = vocab.size

    fnames = sorted(os.listdir(model_dir / 'checkpoints'))
    fname = fnames[-1]  # latest checkpoint (sorted, since os.listdir order is arbitrary)

    # load the model
    model = LanguageModel(cell, input_size, hidden_size, output_size)
    model.load_state_dict(torch.load(model_dir / 'checkpoints' / fname))
    model.eval()

    # prepare the base and replacement batch
    N = 100
    gen = generate.generate('valid',
                            token=token,
                            max_len=max_len,
                            small=small,
                            batch_size=N)
    base_batch, _ = next(gen)
    repl_batch, _ = next(gen)

    # compute the average switch probability over the batch at each keep-depth
    depths = [i for i in range(max_len)]
    switch_probs = [
        compute_switch_prob(model, base_batch, repl_batch, keep_depth, vocab,
                            emb) for keep_depth in depths
    ]

    # make the plot
    fig, ax = plt.subplots()
    ax.plot(depths, switch_probs, 'tomato')
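    # black horizontal reference line at a 1% switch probability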
    ax.plot(depths, [0.01] * len(depths), 'k')
    ax.set_yscale('log')
    ax.set_ylim(0.001, 1)
    ax.set_xlim(0, max_len)
    ax.set_title('Probability of switching predicted character\n{}'.format(
        model_dir.name),
                 fontsize=7)
    ax.set_xlabel('sequence keep-depth')
    ax.set_ylabel('Probability')
    ax.grid()
    plt.savefig(model_dir / 'SwitchProbability.pdf')
Example #20
dataset.save(dataset_specific_info)

params = {}

# account for the 0 token used for padding
params['vocab_size'] = dataset.vocab_size + 1
params['num_classes'] = dataset.vocab_size
params['batch_size'] = batch_size
params['valid_batch_size'] = valid_batch_size
params['seq_len'] = seq_len
params['hidden_dim'] = hidden_size
params['num_layers'] = num_layers
params['embed_size'] = embed_size
params['directoryOutLogs'] = directoryOutLogs

model = LanguageModel(params)
model.compile()
eval_softmax = 5

total_time_training = 0
total_time_valid = 0
loss_list = ''
perp_list = ''
wps_list = ''

time_per_batch = ''
time_per_epoch = ''

for epoch in range(num_epochs):
    dataset.set_data_dir(data_dir)
    dataset.set_batch_size(batch_size)
Example #21
def detach_hidden(h):
    """Detach hidden states from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(detach_hidden(v) for v in h)


torch.backends.cudnn.benchmark = True
torch.manual_seed(0)
np.random.seed(0)

labels = Labels()
num_labels = len(labels)

model = LanguageModel(128, 512, 256, num_labels, n_layers=3, dropout=0.3)
model.cuda()

bptt = 8
batch_size = 32

train = [
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube1120_hq.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube1120.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube700.txt'
]

test = [
    '/media/lytic/STORE/ru_open_stt_wav/text/asr_calls_2_val.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/buriy_audiobooks_2_val.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube700_val.txt'
Example #22
EPOCH = 40
RESCHEDULED = False

train_data, vocab_train = creatDataSet('./data', 'ptb.train.txt')
valid_data, _ = creatDataSet('./data', 'ptb.valid.txt')
test_data, _ = creatDataSet('./data', 'ptb.test.txt')

vocab = list(set(vocab_train))
word2idx, idx2word = word2index(vocab)

trainSet = batchify(prepare_seq(train_data, word2idx), BATCH_SIZE)
testSet = batchify(prepare_seq(test_data, word2idx), BATCH_SIZE//2)
validSet = batchify(prepare_seq(valid_data, word2idx), BATCH_SIZE//2)


model = LanguageModel(len(word2idx), EMBED_SIZE, HIDDEN_SIZE, NUM_LAYER, 0.5).to(device)
model.init_weight()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = LR)

def trainModel(trainSet, validSet):
    for epoch in range(EPOCH):
        total_loss = 0
        losses = []
        hidden = model.init_hidden(BATCH_SIZE)
        total = ceil((trainSet.size(1) - SEQ_LENGTH) / SEQ_LENGTH)
        model.train()
        for i, batch in enumerate(getBatch(trainSet, SEQ_LENGTH)):
            view_bar(i, total, epoch + 1, EPOCH)
            inputs, targets = batch
            hidden = model.detach_hidden(hidden)
Example #23
def train(settings, model_dir):

    # training and sampling
    temperature = 0.5
    how_many = 70
    vocab = generate.get_vocab(args.token, small=args.small)

    # create the vocab, model, (and embedding)
    if args.token == 'word':
        emb = generate.get_embedding('word2vec')
        input_size = emb.vectors.shape[1]
        output_size = emb.vectors.shape[0]
    elif args.token == 'character':
        emb = None
        input_size = vocab.size
        output_size = vocab.size

    model = LanguageModel(args.cell, input_size, args.hidden_size, output_size)

    # create criterion and optimiser
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # create the validation set
    n_valid = 10000
    valid_gen = generate.generate('valid', token=args.token, max_len=args.max_len, small=args.small, batch_size=n_valid)
    for valid_batch, valid_labels in valid_gen:
        # one hot encode
        if args.token == 'character':
            valid_batch = generate.one_hot_encode(valid_batch, vocab)
        # or embed
        elif args.token == 'word':
            valid_batch = generate.w2v_encode(valid_batch, emb, vocab)
        valid_batch, valid_labels = torch.Tensor(valid_batch), torch.Tensor(valid_labels).long()
        break

    # how many epochs do we need?
    batches_per_epoch = generate.get_n_batches_in_epoch('train', args.token, args.batch_size, args.max_len, args.small)

    # training settings
    every_n = int(batches_per_epoch/args.n_saves) if not args.debug else 50
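    # i.e. record losses, validate, and checkpoint n_saves times per epoch
    # (or every 50 batches in debug mode)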
    running_loss = 0
    training_losses = []
    valid_losses = []
    t0 = time.time()
 
    # dump the settings
    pickle.dump(settings, open(model_dir/ 'settings.pkl', 'wb'))
    out_stream = model_dir / 'out_stream.txt'

    # run the training loop
    for epoch in range(1, args.n_epochs+1):

        opening = ['', '#'*20, '# Epoch {} (t={:2.2f}h)'.format(epoch, (time.time() - t0)/3600.), '#'*20, '']
        for txt in opening:
            utils.report(txt, out_stream)

        # create the generator for each epoch
        train_gen = generate.generate('train', token=args.token, max_len=args.max_len,
                                      small=args.small, batch_size=args.batch_size)
        for i, (batch, labels) in enumerate(train_gen):

            # one hot encode
            if args.token == 'character':
                batch = generate.one_hot_encode(batch, vocab)
            # or embed
            elif args.token == 'word':
                batch = generate.w2v_encode(batch, emb, vocab)

            # turn into torch tensors
            batch = torch.Tensor(batch)
            labels = torch.Tensor(labels).long()

            # zero the gradients
            optimizer.zero_grad()

            # forward and backward pass and optimisation step
            outputs = model(batch)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # monitor the losses
            running_loss += loss.item()  # .item() keeps only the value, not the autograd graph
            if i % every_n == (every_n-1):

                # append the training losses
                training_losses.append(float(running_loss/every_n))
                running_loss = 0

                # compute the valid loss
                valid_outputs = model(valid_batch)
                valid_losses.append(float(criterion(valid_outputs, valid_labels)))

                # monitor progress
                monitor = ['\n{}/{} done'.format(i+1, batches_per_epoch)]
                monitor.append(generate.compose(model, vocab, emb, 'The Standard Model of', temperature, how_many))
                for m in monitor:
                    utils.report(m, out_stream)
                
                # save the model
                torch.save(model.state_dict(), model_dir/'checkpoints'/'epoch{}_step_{}.pt'.format(epoch, round(i/every_n)))

            if i >= 1000 and args.debug:
                break
    
    # save information
    dt = (time.time() - t0)
    time_txt = '\ntime taken: {:2.2f}h\n'.format(dt/3600.)
    utils.report(time_txt, out_stream)
    utils.report(str(dt/3600.), model_dir/'time.txt')
        
    loss_dict = {'train':training_losses, 'valid':valid_losses, 'time_taken':dt}
    pickle.dump(loss_dict, open(model_dir/ 'losses.pkl', 'wb'))

    # evaluate
    evaluate.plot_losses(model_dir)
Example #24
def main():
    '''
    Main function that coordinates the entire process. Parses arguments that specify the exercise and the
    experiment that should be run. Initializes the model and the checkpoint managers.
    '''

    parser = argparse.ArgumentParser(
        description='Define configuration of experiments')
    parser.add_argument('--mode',
                        type=str,
                        nargs='+',
                        choices=['train', 'evaluate', 'generate'],
                        required=True)
    parser.add_argument('--experiment',
                        type=str,
                        choices=['a', 'b', 'c'],
                        required=True)
    parser.add_argument('--id', type=str, required=False)
    parser.add_argument('--epochs', type=int, default=EPOCHS, required=False)

    args = parser.parse_args()

    # Setting Experiment Id
    if args.id is None:
        exp_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        print(f"No Experiment Id Set, Creating New: {exp_id}")
    else:
        exp_id = args.id
        print(f"Using Experiment Id: {exp_id}")

    # Setting Directories
    base_dir = f"{OUTPUT_DIR}/exp_{args.experiment}/{exp_id}"
    log_dir = f"{base_dir}/logs"
    submission_dir = f"{base_dir}/submissions"
    if not os.path.exists(submission_dir):
        os.makedirs(submission_dir)
    ckpt_dir = f"{base_dir}/ckpts"

    print(f"Experiment Directory: {base_dir}")

    print(f"Using Tensorflow Version: {tf.__version__}")
    print("Building Vocabulary...")
    build_vocab(input_file=PATH_TRAIN,
                output_file=PATH_VOCAB,
                top_k=VOCAB_SIZE,
                special=SPECIAL)
    word2id, id2word = build_vocab_lookup(PATH_VOCAB, "<unk>")

    # Setting Experiment Specific Configurations
    if args.experiment == 'a':
        lstm_hidden_state_size = 512
        word_embeddings = None

    elif args.experiment == 'b':
        lstm_hidden_state_size = 512
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)

    elif args.experiment == 'c':
        lstm_hidden_state_size = 1024
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)
    else:
        raise ValueError(f"Unknown Experiment {args.experiment}")

    print(f'Initializing Model...')
    model = LanguageModel(vocab_size=VOCAB_SIZE,
                          sentence_length=SENTENCE_LENGTH,
                          embedding_size=EMBEDDING_SIZE,
                          hidden_state_size=lstm_hidden_state_size,
                          output_size=LSTM_OUTPUT_SIZE,
                          batch_size=BATCH_SIZE,
                          word_embeddings=word_embeddings,
                          index_to_word_table=id2word)

    print(f'Initializing Optimizer...')
    optimizer = tf.keras.optimizers.Adam()

    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               optimizer=optimizer,
                               net=model)
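    # the manager below keeps only the five most recent checkpoints in ckpt_dir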
    manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=5)

    if manager.latest_checkpoint:
        print(f"Restoring Model from {manager.latest_checkpoint}...")
        ckpt.restore(manager.latest_checkpoint)
        model_loaded = True
    else:
        print("Initializing Model from Scratch")
        model_loaded = False

    if "train" in args.mode:
        print(f"Starting Training...")
        train_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/train")
        with train_summary_writer.as_default():
            train(ckpt=ckpt,
                  manager=manager,
                  model=model,
                  optimizer=optimizer,
                  word2id=word2id,
                  id2word=id2word,
                  epochs=args.epochs)
        model_loaded = True

    if "evaluate" in args.mode:
        print(f"Starting Evaluation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to be evaluated'

        test_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/evaluate")
        with test_summary_writer.as_default():
            evaluate(
                model=model,
                word2id=word2id,
                id2word=id2word,
                step=optimizer.iterations,
                path_submission=
                f"{submission_dir}/group35.perplexity{args.experiment.upper()}"
            )

    if "generate" in args.mode:
        print(f"Starting Generation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to start generation'

        generate_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/generate")
        with generate_summary_writer.as_default():
            generate(word2id,
                     id2word,
                     model=model,
                     path_submission=f"{submission_dir}/group35.continuation")
Example #25
from dataset import TextDataLoaderIterator
from model import LanguageModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


data_dir = 'data/Gutenberg/split/'
txt_files = [data_dir + file_name for file_name in os.listdir(data_dir)][:5]


if __name__ == '__main__':

    # checkpoint = torch.load('models/lm/latest.pth')

    model = LanguageModel(n_vocab=10000).to(device)
    # model.load_state_dict(checkpoint['model_state_dict'])
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.95, patience=100, min_lr=1e-6)
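    # shrink the learning rate by 5% after 100 steps without improvement, but not below 1e-6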
    # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict'])
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(f"runs/{time.strftime('%Y%m%d-%I:%M%p', time.localtime())}")
    dummy_input = torch.LongTensor([[1]]).to(device)
    writer.add_graph(model, dummy_input)

    # global_step = checkpoint['global_step']
    global_step = 0

    for epoch in range(10):
Example #26

if __name__ == "__main__":
    uniqueService = UniqueService(APP_DBUS_NAME, APP_OBJECT_NAME)

    app = QApplication(sys.argv)
    tray_icon = SystemTrayIcon(
        QIcon(os.path.join(get_parent_dir(__file__), "image", "trayicon.png")),
        app)
    tray_icon.show()
    (constant.TRAYAREA_TOP,
     constant.TRAYAREA_BOTTOM) = tray_icon.get_trayarea()

    plugin = Plugin()

    source_lang_model = LanguageModel()
    dest_lang_model = LanguageModel()

    word_engine_name = setting_config.get_translate_config("word_engine")
    words_engine_name = setting_config.get_translate_config("words_engine")
    translate_simple = imp.load_source(
        "translate_simple",
        plugin.get_plugin_file(word_engine_name)).Translate()
    translate_long = imp.load_source(
        "translate_long",
        plugin.get_plugin_file(words_engine_name)).Translate()
    word_translate_model = plugin.get_word_model(
        setting_config.get_translate_config("src_lang"),
        setting_config.get_translate_config("dst_lang"))
    words_translate_model = plugin.get_words_model(
        setting_config.get_translate_config("src_lang"),