Example #1
def init_vocab(self, data):
    assert self.eval == False  # for eval vocab must exist
    charvocab = CharVocab(data, idx=0)
    wordvocab = WordVocab(data, idx=1, cutoff=self.word_cutoff, lower=True)
    posvocab = WordVocab(data, idx=2)
    featsvocab = FeatureVocab(data, idx=3)
    vocab = MultiVocab({
        'char': charvocab,
        'word': wordvocab,
        'pos': posvocab,
        'feats': featsvocab
    })
    return vocab
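A minimal usage sketch for this method, assuming it lives on a trainer/tagger class that sets self.eval and self.word_cutoff before vocab construction; the class name, data variable, and key-lookup access below are illustrative assumptions, not shown in the snippet:

# Hypothetical caller; "TaggerTrainer" and "train_rows" are illustrative names.
trainer = TaggerTrainer(word_cutoff=7)
trainer.eval = False                      # the assert only allows training mode
vocab = trainer.init_vocab(train_rows)    # rows expose chars/words/POS/feats by column index
word_vocab = vocab['word']                # assumes MultiVocab supports key lookup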
Example #2
def __init__(self, filename, vocabs=None):
    self._filename = filename
    self._vocabs = vocabs if vocabs else [
        WordVocab(),
        LemmaVocab(),
        TagVocab(),
        PredictVocab(),
        SemTagVocab()
    ]
    self._input_vocabs = self.vocabs[:-1]
    self._target_vocabs = [self.vocabs[-1]]
    self._establish_vocab()
    self._data = []
    self._read_data(self._filename, self._data)
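A hedged usage sketch, assuming this __init__ belongs to a dataset/corpus reader class; the class name "SemTagCorpus", the file names, and the vocabs attribute used for sharing are illustrative (the snippet only shows that self.vocabs is read, so a property of that name presumably exists). The last vocab in the list, SemTagVocab, becomes the sole target vocab:

# Illustrative only; the real class name is not shown in the snippet.
train_corpus = SemTagCorpus("train.conll")                           # builds default vocabs
dev_corpus = SemTagCorpus("dev.conll", vocabs=train_corpus.vocabs)    # reuse training vocabs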
Example #3
def train(corpus_file, mode, dim_size, window, min_count, negative, epoch,
          workers):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences,
                         size=dim_size,
                         window=window,
                         min_count=min_count,
                         workers=workers,
                         iter=epoch,
                         negative=negative,
                         sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    word_vocab = WordVocab(Trie(words), lowercase=True)
    entity_vocab = EntityVocab(Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        ind = word_vocab.get_index(word)
        if ind is not None:
            word_embedding[ind] = model[word]

    for entity in entities:
        entity_embedding[entity_vocab.get_index(entity)] = model[
            MARKER + entity.replace(u' ', u'_')]

    return EmbeddingReader(word_embedding, entity_embedding, word_vocab,
                           entity_vocab)
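A hedged invocation sketch: the function expects a bz2-compressed, one-sentence-per-line corpus and returns an EmbeddingReader. Note the snippet targets an older gensim API (size=, iter=, model.vocab) and Python 2 idioms (iteritems, u'' literals). The corpus path and hyper-parameter values below are placeholders:

# Illustrative call; path and hyper-parameters are placeholders.
reader = train('corpus.txt.bz2', mode='sg', dim_size=300, window=5,
               min_count=10, negative=5, epoch=5, workers=8)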
Example #4
def prep(dir_path, annotation_path, out_path):
    desc = str(datetime.datetime.now(
    )) + " Overwrite the preprocessed data: {}? Y/n (default: n)".format(
        out_path)
    if os.path.exists(out_path) and input(desc) != "Y":
        print(str(datetime.datetime.now()) + " Exit.")
        exit()

    print(str(datetime.datetime.now()) + " Building dataset from " + dir_path)

    with open(os.path.join(dir_path, "train.json")) as f:
        train = json.load(f)
    tables, texts = [make_table(ins)
                     for ins in train], list(make_text(train, annotation_path))
    authors = {ins.get("author", "UNK") for ins in train}
    assert len(tables) == len(texts)

    tv = {}
    for k in ("team", "player"):
        tv[k] = TableVocab([t[k] for t in tables], key=k)
    wv = WordVocab({w for doc in texts for sent, _ in doc for w in sent})

    print(str(datetime.datetime.now()) + " Saving dataset to " + out_path)
    with open(out_path, "wb") as f:
        pickle.dump(
            {
                "data": {
                    "text": texts,
                    "table": tables
                },
                "vocab": {
                    "word": wv.__dict__,
                    "table": {k: v.__dict__
                              for k, v in tv.items()}
                },
                "author": {k: i
                           for i, k in enumerate(authors)}
            }, f)
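For completeness, a short sketch of reading the bundle back, based on the dict layout written above (the file path is a placeholder):

import pickle

# Path is illustrative; use whatever out_path was passed to prep().
with open("preprocessed.pkl", "rb") as f:
    bundle = pickle.load(f)

texts = bundle["data"]["text"]               # each doc iterates as (sentence, extra) pairs
tables = bundle["data"]["table"]
word_vocab_state = bundle["vocab"]["word"]    # WordVocab.__dict__ as saved above
author_to_id = bundle["author"]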
Example #5
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

VOCAB_SAVE_PATH = "./data/saves/vocab/"

from vocab import CharVocab, WordVocab

CHAR_VOCAB = CharVocab()
WORD_VOCAB = WordVocab()

EMBEDD_MATRIX = None

VECTOR_DIM = 20


def save_():
    # Persist the character vocabulary and the character embedding matrix.
    with open(VOCAB_SAVE_PATH + 'char_vocab.pickle', 'wb') as pickle_out:
        pickle.dump(CHAR_VOCAB, pickle_out)

    with open('./data/saves/char_embedding.pickle', 'wb') as pickle_out:
        pickle.dump(EMBEDD_MATRIX, pickle_out)


def load_word_vocab():
    # Restore a previously pickled WordVocab into the module-level global.
    global WORD_VOCAB
    with open(VOCAB_SAVE_PATH + 'word_vocab.pickle', 'rb') as pickle_in:
        WORD_VOCAB = pickle.load(pickle_in)
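A brief round-trip sketch for the two helpers above, assuming CHAR_VOCAB and EMBEDD_MATRIX have been populated elsewhere in the module before saving:

# Illustrative round-trip; vocabulary construction itself is not shown in the snippet.
save_()              # writes char_vocab.pickle and char_embedding.pickle
load_word_vocab()    # replaces the module-level WORD_VOCAB from word_vocab.pickle
print(type(WORD_VOCAB))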
Example #6
def main(experiment_path, dataset_path, config_path, restore_path, workers):
    logging.basicConfig(level=logging.INFO)
    config = Config.from_json(config_path)
    fix_seed(config.seed)

    train_data = pd.concat([
        load_data(os.path.join(dataset_path, 'train-clean-100'),
                  workers=workers),
        load_data(os.path.join(dataset_path, 'train-clean-360'),
                  workers=workers),
    ])
    eval_data = pd.concat([
        load_data(os.path.join(dataset_path, 'dev-clean'), workers=workers),
    ])

    if config.vocab == 'char':
        vocab = CharVocab(CHAR_VOCAB)
    elif config.vocab == 'word':
        vocab = WordVocab(train_data['syms'], 30000)
    elif config.vocab == 'subword':
        vocab = SubWordVocab(10000)
    else:
        raise AssertionError('invalid config.vocab: {}'.format(config.vocab))

    train_transform = T.Compose([
        ApplyTo(['sig'], T.Compose([
            LoadSignal(SAMPLE_RATE),
            ToTensor(),
        ])),
        ApplyTo(['syms'], T.Compose([
            VocabEncode(vocab),
            ToTensor(),
        ])),
        Extract(['sig', 'syms']),
    ])
    eval_transform = train_transform

    train_dataset = TrainEvalDataset(train_data, transform=train_transform)
    eval_dataset = TrainEvalDataset(eval_data, transform=eval_transform)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=BatchSampler(train_data,
                                   batch_size=config.batch_size,
                                   shuffle=True,
                                   drop_last=True),
        num_workers=workers,
        collate_fn=collate_fn)

    eval_data_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_sampler=BatchSampler(eval_data, batch_size=config.batch_size),
        num_workers=workers,
        collate_fn=collate_fn)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = Model(SAMPLE_RATE, len(vocab))
    model_to_save = model
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    if restore_path is not None:
        load_weights(model_to_save, restore_path)

    if config.opt.type == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     config.opt.lr,
                                     weight_decay=1e-4)
    elif config.opt.type == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    config.opt.lr,
                                    momentum=0.9,
                                    weight_decay=1e-4)
    else:
        raise AssertionError('invalid config.opt.type {}'.format(
            config.opt.type))

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        len(train_data_loader) * config.epochs)

    # ==================================================================================================================
    # main loop

    train_writer = SummaryWriter(os.path.join(experiment_path, 'train'))
    eval_writer = SummaryWriter(os.path.join(experiment_path, 'eval'))
    best_wer = float('inf')

    for epoch in range(config.epochs):
        if epoch % 10 == 0:
            logging.info(experiment_path)

        # ==============================================================================================================
        # training

        metrics = {
            'loss': Mean(),
            'fps': Mean(),
        }

        model.train()
        t1 = time.time()
        for (sigs, labels), (sigs_mask, labels_mask) in tqdm(
                train_data_loader,
                desc='epoch {} training'.format(epoch),
                smoothing=0.01):
            sigs, labels = sigs.to(device), labels.to(device)
            sigs_mask, labels_mask = sigs_mask.to(device), labels_mask.to(
                device)

            logits, etc = model(sigs, labels[:, :-1], sigs_mask,
                                labels_mask[:, :-1])

            loss = compute_loss(input=logits,
                                target=labels[:, 1:],
                                mask=labels_mask[:, 1:],
                                smoothing=config.label_smoothing)
            metrics['loss'].update(loss.data.cpu().numpy())

            lr = np.squeeze(scheduler.get_lr())

            optimizer.zero_grad()
            loss.mean().backward()
            optimizer.step()
            scheduler.step()

            t2 = time.time()
            metrics['fps'].update(1 / ((t2 - t1) / sigs.size(0)))
            t1 = t2

        with torch.no_grad():
            metrics = {k: metrics[k].compute_and_reset() for k in metrics}
            print('[EPOCH {}][TRAIN] {}'.format(
                epoch, ', '.join('{}: {:.4f}'.format(k, metrics[k])
                                 for k in metrics)))
            for k in metrics:
                train_writer.add_scalar(k, metrics[k], global_step=epoch)
            train_writer.add_scalar('learning_rate', lr, global_step=epoch)

            train_writer.add_image('spectras',
                                   torchvision.utils.make_grid(
                                       etc['spectras'],
                                       nrow=compute_nrow(etc['spectras']),
                                       normalize=True),
                                   global_step=epoch)
            for k in etc['weights']:
                w = etc['weights'][k]
                train_writer.add_image('weights/{}'.format(k),
                                       torchvision.utils.make_grid(
                                           w,
                                           nrow=compute_nrow(w),
                                           normalize=True),
                                       global_step=epoch)

            for i, (true, pred) in enumerate(
                    zip(labels[:, 1:][:4].detach().data.cpu().numpy(),
                        np.argmax(logits[:4].detach().data.cpu().numpy(),
                                  -1))):
                print('{}:'.format(i))
                text = vocab.decode(
                    take_until_token(true.tolist(), vocab.eos_id))
                print(colored(text, 'green'))
                text = vocab.decode(
                    take_until_token(pred.tolist(), vocab.eos_id))
                print(colored(text, 'yellow'))

        # ==============================================================================================================
        # evaluation

        metrics = {
            # 'loss': Mean(),
            'wer': Mean(),
        }

        model.eval()
        with torch.no_grad(), Pool(workers) as pool:
            for (sigs, labels), (sigs_mask, labels_mask) in tqdm(
                    eval_data_loader,
                    desc='epoch {} evaluating'.format(epoch),
                    smoothing=0.1):
                sigs, labels = sigs.to(device), labels.to(device)
                sigs_mask, labels_mask = sigs_mask.to(device), labels_mask.to(
                    device)

                logits, etc = model.infer(sigs,
                                          sigs_mask,
                                          sos_id=vocab.sos_id,
                                          eos_id=vocab.eos_id,
                                          max_steps=labels.size(1) + 10)

                # loss = compute_loss(
                #     input=logits, target=labels[:, 1:], mask=labels_mask[:, 1:], smoothing=config.label_smoothing)
                # metrics['loss'].update(loss.data.cpu().numpy())

                wer = compute_wer(input=logits,
                                  target=labels[:, 1:],
                                  vocab=vocab,
                                  pool=pool)
                metrics['wer'].update(wer)

        with torch.no_grad():
            metrics = {k: metrics[k].compute_and_reset() for k in metrics}
            print('[EPOCH {}][EVAL] {}'.format(
                epoch, ', '.join('{}: {:.4f}'.format(k, metrics[k])
                                 for k in metrics)))
            for k in metrics:
                eval_writer.add_scalar(k, metrics[k], global_step=epoch)

            eval_writer.add_image('spectras',
                                  torchvision.utils.make_grid(
                                      etc['spectras'],
                                      nrow=compute_nrow(etc['spectras']),
                                      normalize=True),
                                  global_step=epoch)
            for k in etc['weights']:
                w = etc['weights'][k]
                eval_writer.add_image('weights/{}'.format(k),
                                      torchvision.utils.make_grid(
                                          w,
                                          nrow=compute_nrow(w),
                                          normalize=True),
                                      global_step=epoch)

        save_model(model_to_save, experiment_path)
        if metrics['wer'] < best_wer:
            best_wer = metrics['wer']
            save_model(model_to_save,
                       mkdir(os.path.join(experiment_path, 'best')))
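For reference, a sketch of the config fields this script reads (config.seed, config.vocab, config.batch_size, config.epochs, config.label_smoothing, config.opt.type, config.opt.lr). The values below are illustrative, and the real schema consumed by Config.from_json may contain additional fields:

# Illustrative config contents; write with json.dump and pass the file path as config_path.
example_config = {
    "seed": 42,
    "vocab": "word",            # one of 'char', 'word', 'subword' per the branch above
    "batch_size": 32,
    "epochs": 100,
    "label_smoothing": 0.1,
    "opt": {"type": "adam", "lr": 1e-3},
}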
Example #7
def get_vocab(data):
    char_vocab = CharVocab(data, idx=0)
    word_vocab = WordVocab(data, idx=0, cutoff=2)
    vocab = MultiVocab({'char': char_vocab, 'word': word_vocab})
    return vocab