Example no. 1
import glob
import shutil

import utils  # local module providing maybe_create_dir


def copy_files(path, dest):
    """Copy every file matching the glob pattern `path` into `dest`."""
    utils.maybe_create_dir(dest)
    for f in glob.glob(path):
        filename = f.split("/")[-1]
        shutil.copy(f, "%s/%s" % (dest, filename), follow_symlinks=True)
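
A minimal usage sketch; the glob pattern and destination are hypothetical:

copy_files("data/dictionary/*.json", "output")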
Example no. 2
import numpy as np

import preprocessing  # local module: syllable2ix, character2ix
import utils  # local module: maybe_create_dir


def prepare_syllable_charater_seq_data(files,
                                       ch2ix,
                                       sy2ix,
                                       sampling=10,
                                       output_dir=""):
    """Write character-level sequence-labelling data built from parallel
    syllable-segmented .txt/.label file pairs."""
    training, validation = files

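    # optionally subsample the file lists for quick experiments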
    if sampling:
        training = training[:sampling]
        validation = validation[:sampling]

    output_dir = "%s/best-syllable-crf-and-character-seq-feature-sampling-%d" % (
        output_dir, sampling)

    print("Saving data to %s" % output_dir)
    utils.maybe_create_dir(output_dir)
    for name, dataset in zip(("training", "val"), (training, validation)):
        print("working on : %s" % name)
        with open("%s/%s.txt" % (output_dir, name), "w") as fout_txt:
            for path in dataset:
                with open(path,
                          "r") as fin, open(path.replace(".txt", ".label"),
                                            "r") as flab:

                    has_space_problem = False

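                    # lines of the .txt file pair 1:1 with lines of .label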
                    for txt, label in zip(fin, flab):
                        txt = txt.strip().replace("~~", "~")

                        if not txt:
                            continue

                        label = label.strip()
                        syllables = txt.split("~")

                        chars_idx = []
                        char_labels = []
                        syllable_idx = []

                        syllable_indices = [
                            preprocessing.syllable2ix(sy2ix, sy)
                            for sy in syllables
                        ]

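                        # each syllable must carry exactly one label character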
                        if len(syllables) != len(label):
                            print(txt, path)
                            print(len(syllables), len(label))
                            print(syllables)
                            print(label)
                            raise SystemExit(
                                "syllable/label length mismatch in %s" % path)

                        label = list(label)
                        for ii, (syllable, six, l) in enumerate(
                                zip(syllables, syllable_indices, label)):
                            if not syllable:
                                continue

                            if syllable == " ":
                                # if the syllable after this space begins a
                                # word (label "1"), the space is a boundary
                                # too; guard against a trailing space
                                if ii + 1 < len(label) and label[ii + 1] == "1":
                                    l = "1"
                                else:
                                    l = "0"

                            chs = [
                                preprocessing.character2ix(ch2ix, c)
                                for c in syllable
                            ]

                            total_chs = len(chs)
                            syllable_idx.extend([six] * total_chs)

                            chars_idx.extend(chs)
                            if l == "1":
                                char_labels.extend(["1"] + ["0"] *
                                                   (total_chs - 1))
                            else:
                                char_labels.extend(["0"] * total_chs)

                        assert len(char_labels) == len(chars_idx)

                        # sanity check: character id 3 is assumed to be the
                        # space character; a space labelled "0" suggests a
                        # missed word boundary
                        if not has_space_problem:
                            for cix, clb in zip(chars_idx, char_labels):
                                if cix == 3 and clb == "0":
                                    has_space_problem = True
                                    print(txt)
                                    break

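                        # one output line per sentence:
                        # <char labels>::<char ids>::<syllable ids>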
                        fout_txt.write("%s::%s::%s\n" % (
                            "".join(char_labels),
                            " ".join(np.array(chars_idx).astype(str)),
                            " ".join(np.array(syllable_idx).astype(str)),
                        ))

                    if has_space_problem:
                        print("problem with space in %s" % path)

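
A minimal invocation sketch. The file layout and the ch2ix/sy2ix index maps are hypothetical; the function expects parallel .txt/.label files plus dictionaries mapping characters and syllables to integer ids:

import glob

# hypothetical layout; ch2ix and sy2ix are assumed to be loaded already
training_files = sorted(glob.glob("data/training/*.txt"))
val_files = sorted(glob.glob("data/val/*.txt"))

prepare_syllable_charater_seq_data((training_files, val_files),
                                   ch2ix, sy2ix,
                                   sampling=0,
                                   output_dir="artifacts")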
Example no. 3
import torch
from torch import optim
from torch.utils import data

# Local modules and helpers (models, utils, dl, get_device, get_lr,
# do_iterate, print_floydhub_metrics, copy_files) are assumed to be
# defined elsewhere in this repository.


def main(model_name,
         data_dir,
         epoch=10,
         lr=0.001,
         batch_size=64,
         weight_decay=0.0,
         checkpoint=5,
         model_params="",
         output_dir="",
         no_workers=4,
         lr_schedule="",
         prev_model=""):
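    """Train `model_name` on preprocessed data from `data_dir`, saving
    model and optimizer checkpoints to `output_dir`."""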

    model_cls = models.get_model(model_name)

    dataset_cls = model_cls.dataset()

    training_set: dl.SequenceDataset = dataset_cls.load_preprocessed_file_with_suffix(
        data_dir, "training.txt")

    validation_set: dl.SequenceDataset = dataset_cls.load_preprocessed_file_with_suffix(
        data_dir, "val.txt")

    # set up the featurizer from the saved dictionary; the returned
    # config is required to construct the model below
    data_config = training_set.setup_featurizer("%s/dictionary" % data_dir)

    device = get_device()
    print("Using device: %s" % device)

    params = {}

    if model_params:
        params['model_config'] = model_params
        print(">> model configuration: %s" % model_params)

    if prev_model:
        print("Initiate model from %s" % prev_model)
        model = models.get_model(model_name).load(prev_model, data_config,
                                                  **params)
    else:
        model = models.get_model(model_name)(data_config, **params)

    model = model.to(device)

    criterion = torch.nn.BCEWithLogitsLoss()

    optimizer = optim.Adam(model.parameters(),
                           lr=lr,
                           weight_decay=weight_decay)

    if prev_model:
        print("Loading prev optmizer's state")
        optimizer.load_state_dict(torch.load("%s/optimizer.pth" % prev_model))
        print("Previous learning rate", get_lr(optimizer))

        # override the restored learning rate with the lr passed in
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
            param_group['initial_lr'] = lr

        print("Current learning rate", get_lr(optimizer))

    if lr_schedule:
        schedule_params = utils.parse_model_params(lr_schedule)
        scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=schedule_params['step'],
            gamma=schedule_params['gamma'],
        )

    dataloader_params = dict(batch_size=batch_size,
                             num_workers=no_workers,
                             collate_fn=dataset_cls.collate_fn)

    print("Using dataset: %s" % type(dataset_cls).__name__)

    training_generator = data.DataLoader(training_set,
                                         shuffle=True,
                                         **dataloader_params)
    validation_generator = data.DataLoader(validation_set,
                                           shuffle=False,
                                           **dataloader_params)

    total_train_size = len(training_set)
    total_test_size = len(validation_set)

    print("We have %d train samples and %d test samples" %
          (total_train_size, total_test_size))

    # for FloydHub
    print('{"metric": "%s:%s", "value": %s}' %
          ("model", model_name, model.total_trainable_params()))

    utils.maybe_create_dir(output_dir)

    copy_files("%s/dictionary/*.json" % data_dir, output_dir)

    utils.save_training_params(
        output_dir,
        utils.ModelParams(name=model_name, params=model.model_params))

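    # main loop: one training pass and one validation pass per epoch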
    for e in range(1, epoch + 1):
        print("=== EPOCH %d ===" % e)
        if lr_schedule:
            curr_lr = get_lr(optimizer)
            print_floydhub_metrics(dict(lr=curr_lr), step=e, prefix="global")
            print("lr: ", curr_lr)

        with utils.Timer("epoch-training") as timer:
            do_iterate(
                model,
                training_generator,
                prefix="training",
                step=e,
                device=device,
                optimizer=optimizer,
                criterion=criterion,
            )

        with utils.Timer("epoch-validation") as timer, \
            torch.no_grad():
            do_iterate(
                model,
                validation_generator,
                prefix="validation",
                step=e,
                device=device,
                criterion=criterion,
            )

        if lr_schedule:
            scheduler.step()

        if checkpoint and e % checkpoint == 0:
            model_path = "%s/model-e-%d.pth" % (output_dir, e)
            print("Saving model to %s" % model_path)
            torch.save(model.state_dict(), model_path)

    model_path = "%s/model.pth" % output_dir
    opt_path = "%s/optimizer.pth" % output_dir

    print("Saving model to %s" % model_path)
    torch.save(model.state_dict(), model_path)
    torch.save(optimizer.state_dict(), opt_path)
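
A sketch of a possible entry point; the model name and argument values are purely illustrative:

if __name__ == "__main__":
    # "seq-model" is a hypothetical model name
    main("seq-model",
         data_dir="artifacts/best-syllable-crf-and-character-seq-feature-sampling-0",
         epoch=20,
         batch_size=64,
         output_dir="output")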