Example 1
 def init_vocab(self, data):
     assert not self.eval  # the vocab must already exist in eval mode, so only build it when training
     charvocab = CharVocab(data, idx=0)
     wordvocab = WordVocab(data, idx=1, cutoff=self.word_cutoff, lower=True)
     posvocab = WordVocab(data, idx=2)
     featsvocab = FeatureVocab(data, idx=3)
     vocab = MultiVocab({
         'char': charvocab,
         'word': wordvocab,
         'pos': posvocab,
         'feats': featsvocab
     })
     return vocab
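MultiVocab here simply bundles several vocabularies under string keys so later stages can look up each annotation layer by name. A minimal self-contained sketch of that idea (a simplified, hypothetical stand-in, not the project's actual classes):

class SimpleVocab:
    # Minimal token-to-id map with an <unk> fallback at index 0 (illustrative only)
    def __init__(self, tokens):
        self.unit2id = {'<unk>': 0}
        for tok in tokens:
            self.unit2id.setdefault(tok, len(self.unit2id))

    def map(self, units):
        return [self.unit2id.get(u, 0) for u in units]


class SimpleMultiVocab(dict):
    # Groups sub-vocabs under names such as 'char', 'word', 'pos', 'feats'
    pass


mv = SimpleMultiVocab({'word': SimpleVocab(['the', 'cat']),
                       'pos': SimpleVocab(['DET', 'NOUN'])})
print(mv['word'].map(['the', 'dog']))  # -> [1, 0]; 'dog' falls back to <unk>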
Example 2
def train(corpus_file, mode, dim_size, window, min_count, negative, epoch,
          workers):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences,
                         size=dim_size,
                         window=window,
                         min_count=min_count,
                         workers=workers,
                         iter=epoch,
                         negative=negative,
                         sg=sg)

    words = []
    entities = []
    for w in model.wv.vocab:  # vocab dict lives on model.wv; iteritems() was Python 2-only
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    word_vocab = WordVocab(Trie(words), lowercase=True)
    entity_vocab = EntityVocab(Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        ind = word_vocab.get_index(word)
        if ind is not None:
            word_embedding[ind] = model[word]

    for entity in entities:
        entity_embedding[entity_vocab.get_index(entity)] = model[
            MARKER + entity.replace(u' ', u'_')]

    return EmbeddingReader(word_embedding, entity_embedding, word_vocab,
                           entity_vocab)
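Note that this snippet targets the pre-4.0 gensim API (size=, iter=, and model[word] indexing). On gensim >= 4.0 the rough equivalent, reusing the variables from the snippet above, would be:

from gensim.models import Word2Vec

# gensim >= 4.0 renamed size -> vector_size and iter -> epochs
model = Word2Vec(sentences, vector_size=dim_size, window=window,
                 min_count=min_count, workers=workers, epochs=epoch,
                 negative=negative, sg=sg)

for w in model.wv.key_to_index:   # the vocabulary now lives in model.wv.key_to_index
    vec = model.wv[w]             # per-token vectors are looked up through model.wv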
Example 3
 def __init__(self, filename, vocabs=None):
     self._filename = filename
     self._vocabs = vocabs if vocabs else [
         WordVocab(),
         LemmaVocab(),
         TagVocab(),
         PredictVocab(),
         SemTagVocab()
     ]
     self._input_vocabs = self._vocabs[:-1]
     self._target_vocabs = [self._vocabs[-1]]
     self._establish_vocab()
     self._data = []
     self._read_data(self._filename, self._data)
Example 4
def decode(vocab_file, model_file, input_file, no_prog):
    d = pickle.load(open(vocab_file, "rb"))
    wv = WordVocab.from_dump(d["vocab"]["word"])
    tv = {k: TableVocab.from_dump(v) for k, v in d["vocab"]["table"].items()}
    writer = d["author"] if "writer" in model_file else None
    model = Reporter.parse_config(tv=tv,
                                  wv=wv,
                                  writer=writer,
                                  model_file=model_file)

    inputs = json.load(open(input_file))
    for ins in tqdm(inputs, total=len(inputs), ncols=80, disable=no_prog):
        print(
            model.decode(
                make_table(ins),
                writer=writer.get(ins.get("author"), 0) if writer else None))
Example 5
def prep(dir_path, annotation_path, out_path):
    desc = str(datetime.datetime.now(
    )) + " Overwrite the preprocessed data: {}? Y/n (default: n)".format(
        out_path)
    if os.path.exists(out_path) and input(desc) != "Y":
        print(str(datetime.datetime.now()) + " Exit.")
        exit()

    print(str(datetime.datetime.now()) + " Building dataset from " + dir_path)

    train = json.load(open(os.path.join(dir_path, "train.json")))
    tables, texts = [make_table(ins)
                     for ins in train], list(make_text(train, annotation_path))
    authors = {ins.get("author", "UNK") for ins in train}
    assert len(tables) == len(texts)

    tv = {}
    for k in ("team", "player"):
        tv[k] = TableVocab([t[k] for t in tables], key=k)
    wv = WordVocab({w for doc in texts for sent, _ in doc for w in sent})

    print(str(datetime.datetime.now()) + " Saving dataset from " + out_path)
    pickle.dump(
        {
            "data": {
                "text": texts,
                "table": tables
            },
            "vocab": {
                "word": wv.__dict__,
                "table": {k: v.__dict__
                          for k, v in tv.items()}
            },
            "author": {k: i
                       for i, k in enumerate(authors)}
        }, open(out_path, "wb"))
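prep() stores each vocab as its plain __dict__; the from_dump classmethod used when this pickle is read back (see Examples 4 and 10) presumably just rebuilds an instance from that mapping. A minimal sketch of such a method, assuming no extra post-processing is needed:

class WordVocab:
    @classmethod
    def from_dump(cls, state):
        # Recreate an instance directly from a pickled __dict__ (illustrative assumption)
        obj = cls.__new__(cls)
        obj.__dict__.update(state)
        return obj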
Example 6
    epochs = 10
    num_workers = 1

    with_cuda = False
    log_freq = 20
    corpus_lines = None
    cuda_devices = None
    on_memory = True

    lr = 1e-4
    adam_weight_decay = 0.01
    adam_beta1 = 0.9
    adam_beta2 = 0.999

    print("Loading Vocab", vocab_path)
    vocab = WordVocab.load_vocab(vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", train_dataset)
    train_dataset = BERTDataset(train_dataset,
                                vocab,
                                seq_len=seq_len,
                                corpus_lines=corpus_lines,
                                on_memory=on_memory)

    print("Loading Test Dataset", test_dataset)
    test_dataset = BERTDataset(test_dataset, vocab, seq_len=seq_len, on_memory=on_memory) \
        if test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
Example 7
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

VOCAB_SAVE_PATH = "./data/saves/vocab/"

from vocab import CharVocab, WordVocab

CHAR_VOCAB = CharVocab()
WORD_VOCAB = WordVocab()

EMBEDD_MATRIX = None

VECTOR_DIM = 20


def save_():
    with open(VOCAB_SAVE_PATH + 'char_vocab.pickle', 'wb') as pickle_out:
        pickle.dump(CHAR_VOCAB, pickle_out)

    with open('./data/saves/char_embedding.pickle', 'wb') as pickle_out:
        pickle.dump(EMBEDD_MATRIX, pickle_out)


def load_word_vocab():
    global WORD_VOCAB
    with open(VOCAB_SAVE_PATH + 'word_vocab.pickle', 'rb') as f:
        WORD_VOCAB = pickle.load(f)
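EMBEDD_MATRIX is left as None above; judging by VECTOR_DIM and the char-level vocab, it is presumably filled with one VECTOR_DIM-sized row per character before save_() runs. A hedged sketch of such an initialization (the vocab size 128 is an assumption; the real value would come from CHAR_VOCAB):

num_chars = 128  # assumed size; in the real pipeline this would be derived from CHAR_VOCAB
EMBEDD_MATRIX = np.random.normal(scale=0.1, size=(num_chars, VECTOR_DIM)).astype(np.float32)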
Example 8
                        type=int,
                        default=1,
                        help="number of batch_size")
    parser.add_argument(
        "--multi_segment",
        type=bool,
        default=False,
        help="whether to use multiple segment_labels for entity types")
    parser.add_argument("--sep_label",
                        type=bool,
                        default=False,
                        help="whether to insert <sep>")
    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory,
                                multi_segment=args.multi_segment,
                                sep=args.sep_label)

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
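The file behind --vocab_path is built ahead of time; assuming the codertimo/BERT-pytorch-style WordVocab that these arguments appear to follow, creating and saving it might look roughly like this (paths and package layout are assumptions):

from bert_pytorch.dataset import WordVocab  # assumed import path

with open("corpus.txt", encoding="utf-8") as f:
    vocab = WordVocab(f, max_size=None, min_freq=1)  # count tokens line by line
vocab.save_vocab("vocab.pkl")  # later restored with WordVocab.load_vocab("vocab.pkl")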
Example 9
def main(experiment_path, dataset_path, config_path, restore_path, workers):
    logging.basicConfig(level=logging.INFO)
    config = Config.from_json(config_path)
    fix_seed(config.seed)

    train_data = pd.concat([
        load_data(os.path.join(dataset_path, 'train-clean-100'),
                  workers=workers),
        load_data(os.path.join(dataset_path, 'train-clean-360'),
                  workers=workers),
    ])
    eval_data = pd.concat([
        load_data(os.path.join(dataset_path, 'dev-clean'), workers=workers),
    ])

    if config.vocab == 'char':
        vocab = CharVocab(CHAR_VOCAB)
    elif config.vocab == 'word':
        vocab = WordVocab(train_data['syms'], 30000)
    elif config.vocab == 'subword':
        vocab = SubWordVocab(10000)
    else:
        raise AssertionError('invalid config.vocab: {}'.format(config.vocab))

    train_transform = T.Compose([
        ApplyTo(['sig'], T.Compose([
            LoadSignal(SAMPLE_RATE),
            ToTensor(),
        ])),
        ApplyTo(['syms'], T.Compose([
            VocabEncode(vocab),
            ToTensor(),
        ])),
        Extract(['sig', 'syms']),
    ])
    eval_transform = train_transform

    train_dataset = TrainEvalDataset(train_data, transform=train_transform)
    eval_dataset = TrainEvalDataset(eval_data, transform=eval_transform)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=BatchSampler(train_data,
                                   batch_size=config.batch_size,
                                   shuffle=True,
                                   drop_last=True),
        num_workers=workers,
        collate_fn=collate_fn)

    eval_data_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_sampler=BatchSampler(eval_data, batch_size=config.batch_size),
        num_workers=workers,
        collate_fn=collate_fn)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = Model(SAMPLE_RATE, len(vocab))
    model_to_save = model
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    if restore_path is not None:
        load_weights(model_to_save, restore_path)

    if config.opt.type == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     config.opt.lr,
                                     weight_decay=1e-4)
    elif config.opt.type == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    config.opt.lr,
                                    momentum=0.9,
                                    weight_decay=1e-4)
    else:
        raise AssertionError('invalid config.opt.type {}'.format(
            config.opt.type))

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        len(train_data_loader) * config.epochs)

    # ==================================================================================================================
    # main loop

    train_writer = SummaryWriter(os.path.join(experiment_path, 'train'))
    eval_writer = SummaryWriter(os.path.join(experiment_path, 'eval'))
    best_wer = float('inf')

    for epoch in range(config.epochs):
        if epoch % 10 == 0:
            logging.info(experiment_path)

        # ==============================================================================================================
        # training

        metrics = {
            'loss': Mean(),
            'fps': Mean(),
        }

        model.train()
        t1 = time.time()
        for (sigs, labels), (sigs_mask, labels_mask) in tqdm(
                train_data_loader,
                desc='epoch {} training'.format(epoch),
                smoothing=0.01):
            sigs, labels = sigs.to(device), labels.to(device)
            sigs_mask, labels_mask = sigs_mask.to(device), labels_mask.to(
                device)

            logits, etc = model(sigs, labels[:, :-1], sigs_mask,
                                labels_mask[:, :-1])

            loss = compute_loss(input=logits,
                                target=labels[:, 1:],
                                mask=labels_mask[:, 1:],
                                smoothing=config.label_smoothing)
            metrics['loss'].update(loss.data.cpu().numpy())

            lr = np.squeeze(scheduler.get_lr())

            optimizer.zero_grad()
            loss.mean().backward()
            optimizer.step()
            scheduler.step()

            t2 = time.time()
            metrics['fps'].update(1 / ((t2 - t1) / sigs.size(0)))
            t1 = t2

        with torch.no_grad():
            metrics = {k: metrics[k].compute_and_reset() for k in metrics}
            print('[EPOCH {}][TRAIN] {}'.format(
                epoch, ', '.join('{}: {:.4f}'.format(k, metrics[k])
                                 for k in metrics)))
            for k in metrics:
                train_writer.add_scalar(k, metrics[k], global_step=epoch)
            train_writer.add_scalar('learning_rate', lr, global_step=epoch)

            train_writer.add_image('spectras',
                                   torchvision.utils.make_grid(
                                       etc['spectras'],
                                       nrow=compute_nrow(etc['spectras']),
                                       normalize=True),
                                   global_step=epoch)
            for k in etc['weights']:
                w = etc['weights'][k]
                train_writer.add_image('weights/{}'.format(k),
                                       torchvision.utils.make_grid(
                                           w,
                                           nrow=compute_nrow(w),
                                           normalize=True),
                                       global_step=epoch)

            for i, (true, pred) in enumerate(
                    zip(labels[:, 1:][:4].detach().data.cpu().numpy(),
                        np.argmax(logits[:4].detach().data.cpu().numpy(),
                                  -1))):
                print('{}:'.format(i))
                text = vocab.decode(
                    take_until_token(true.tolist(), vocab.eos_id))
                print(colored(text, 'green'))
                text = vocab.decode(
                    take_until_token(pred.tolist(), vocab.eos_id))
                print(colored(text, 'yellow'))

        # ==============================================================================================================
        # evaluation

        metrics = {
            # 'loss': Mean(),
            'wer': Mean(),
        }

        model.eval()
        with torch.no_grad(), Pool(workers) as pool:
            for (sigs, labels), (sigs_mask, labels_mask) in tqdm(
                    eval_data_loader,
                    desc='epoch {} evaluating'.format(epoch),
                    smoothing=0.1):
                sigs, labels = sigs.to(device), labels.to(device)
                sigs_mask, labels_mask = sigs_mask.to(device), labels_mask.to(
                    device)

                logits, etc = model.infer(sigs,
                                          sigs_mask,
                                          sos_id=vocab.sos_id,
                                          eos_id=vocab.eos_id,
                                          max_steps=labels.size(1) + 10)

                # loss = compute_loss(
                #     input=logits, target=labels[:, 1:], mask=labels_mask[:, 1:], smoothing=config.label_smoothing)
                # metrics['loss'].update(loss.data.cpu().numpy())

                wer = compute_wer(input=logits,
                                  target=labels[:, 1:],
                                  vocab=vocab,
                                  pool=pool)
                metrics['wer'].update(wer)

        with torch.no_grad():
            metrics = {k: metrics[k].compute_and_reset() for k in metrics}
            print('[EPOCH {}][EVAL] {}'.format(
                epoch, ', '.join('{}: {:.4f}'.format(k, metrics[k])
                                 for k in metrics)))
            for k in metrics:
                eval_writer.add_scalar(k, metrics[k], global_step=epoch)

            eval_writer.add_image('spectras',
                                  torchvision.utils.make_grid(
                                      etc['spectras'],
                                      nrow=compute_nrow(etc['spectras']),
                                      normalize=True),
                                  global_step=epoch)
            for k in etc['weights']:
                w = etc['weights'][k]
                eval_writer.add_image('weights/{}'.format(k),
                                      torchvision.utils.make_grid(
                                          w,
                                          nrow=compute_nrow(w),
                                          normalize=True),
                                      global_step=epoch)

        save_model(model_to_save, experiment_path)
        if metrics['wer'] < best_wer:
            best_wer = metrics['wer']
            save_model(model_to_save,
                       mkdir(os.path.join(experiment_path, 'best')))
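The call WordVocab(train_data['syms'], 30000) above apparently caps the word vocabulary at the 30,000 most frequent symbols. A minimal sketch of that pattern (a hypothetical helper, not the project's class):

from collections import Counter


def build_capped_vocab(sym_sequences, max_size):
    # Keep the max_size most frequent symbols and reserve index 0 for <unk>
    counts = Counter(s for seq in sym_sequences for s in seq)
    itos = ['<unk>'] + [s for s, _ in counts.most_common(max_size)]
    return {s: i for i, s in enumerate(itos)}


print(build_capped_vocab([['a', 'b', 'a'], ['b', 'c']], max_size=2))
# -> {'<unk>': 0, 'a': 1, 'b': 2}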
Example 10
def train(vocab_file, valid_file, nh_vocab, nh_rnn, writer, learning_rate,
          lr_decay, batch_size, n_epoch, log_dir):
    log_dir = os.path.join(log_dir, str(int(time.time())))

    # Initialize...
    print(str(datetime.datetime.now()) + " Log dir at {}".format(log_dir))
    os.mkdir(log_dir)
    print(str(datetime.datetime.now()) + " Loading dataset...")
    d = pickle.load(open(vocab_file, "rb"))
    texts, tables = d["data"]["text"], d["data"]["table"]
    wv = WordVocab.from_dump(d["vocab"]["word"])
    tv = {k: TableVocab.from_dump(v) for k, v in d["vocab"]["table"].items()}
    writer = d["author"] if writer else None

    print(str(datetime.datetime.now()) + " Vectorizing...")
    data = list(vectorize(texts, tables, wv, tv, writer))

    valid = json.load(open(valid_file)) if valid_file else None

    # Model
    model = Reporter(tv=tv,
                     wv=wv,
                     nh_vocab=nh_vocab,
                     nh_rnn=nh_rnn,
                     writer=writer)
    print(str(datetime.datetime.now()) + " Model configurations...")
    print(str(datetime.datetime.now()) + " " + str(model))

    # Trainer
    trainer = Trainer(model,
                      lr=learning_rate,
                      decay=lr_decay,
                      batch_size=batch_size)
    print(str(datetime.datetime.now()) + " Trainer configurations...")
    print(str(datetime.datetime.now()) + " " + str(trainer))

    try:
        best = 0.
        print(str(datetime.datetime.now()) + " Start training...")
        for _ in range(n_epoch):
            trainer.fit_partial(data)
            pc_name = str(model) + "_{}.dy".format(trainer.iter)
            model.pc.save(os.path.join(log_dir, pc_name))

            if valid and trainer.iter >= 5:
                pred = []
                prog = tqdm(
                    desc="Evaluation: ",
                    total=len(valid) + 1,
                    ncols=80,
                )
                for ins in valid:
                    p = model.decode(make_table(ins),
                                     writer=writer.get(ins.get("author"))
                                     if writer else None)
                    pred.append(p.split())
                    prog.update()

                bleu = nltk.translate.bleu_score.corpus_bleu(
                    [[nltk.word_tokenize(' '.join(v["summary"]))]
                     for v in valid], pred)
                prog.set_postfix(BLEU=bleu)
                prog.update()
                prog.close()
                if bleu > best:
                    best = bleu
                    print(str(datetime.datetime.now()) + " Save best model...")
                    model.pc.save(
                        os.path.join(log_dir,
                                     str(model) + "_best.dy"))

    except KeyboardInterrupt:
        print("KeyboardInterrupted...")
Example 11
def get_vocab(data):
    char_vocab = CharVocab(data, idx=0)
    word_vocab = WordVocab(data, idx=0, cutoff=2)
    vocab = MultiVocab({'char': char_vocab, 'word': word_vocab})
    return vocab
Example 12
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        required=True,
                        type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t",
                        "--test_dataset",
                        type=str,
                        default=None,
                        help="test set for evaluate train set")
    parser.add_argument("-v",
                        "--vocab_path",
                        required=True,
                        type=str,
                        help="built vocab model path with bert-vocab")
    parser.add_argument("-o",
                        "--output_path",
                        required=True,
                        type=str,
                        help="ex)output/bert.model")
    parser.add_argument("-bt",
                        "--bert_path",
                        type=str,
                        help='path of pretrained bert')

    parser.add_argument("-hs",
                        "--hidden",
                        type=int,
                        default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l",
                        "--layers",
                        type=int,
                        default=8,
                        help="number of layers")
    parser.add_argument("-a",
                        "--attn_heads",
                        type=int,
                        default=8,
                        help="number of attention heads")
    parser.add_argument("-s",
                        "--seq_len",
                        type=int,
                        default=100,
                        help="maximum sequence len")

    parser.add_argument("-b",
                        "--batch_size",
                        type=int,
                        default=64,
                        help="number of batch_size")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=10,
                        help="number of epochs")
    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=4,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=None,
                        help="CUDA device ids")

    parser.add_argument("--lr",
                        type=float,
                        default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")

    d = vars(parser.parse_args())
    args = Option(d)

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = Dataset(args.train_dataset, vocab, seq_len=args.seq_len)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = Dataset(args.test_dataset, vocab, seq_len=args.seq_len) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building model")
    bert = torch.load(args.bert_path)
    model = Readout(bert.hidden, args.hidden)
    #model = bert_fc(args.bert_path, args.hidden)

    print("Creating Trainer")
    trainer = Trainer(args,
                      bert,
                      model,
                      train_dataloader=train_data_loader,
                      test_dataloader=test_data_loader)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)
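Option(d) wraps the argparse result so the Trainer can read options as attributes; a minimal sketch of such a wrapper (hypothetical, assuming it does nothing beyond attribute access):

class Option:
    # Tiny attribute-style view over a plain dict (illustrative assumption)
    def __init__(self, d):
        self.__dict__.update(d)


opts = Option({"lr": 1e-3, "epochs": 10})
print(opts.lr, opts.epochs)  # 0.001 10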