コード例 #1
0
ファイル: prepare_train.py プロジェクト: qmeeus/assist
def prepare_traindir(expdir, recipe):
    """Create ``expdir`` and fill it with config copies and training lists.

    The configuration files and ``structure.xml`` are copied from ``recipe``
    into a newly created ``expdir``. The feature and task listings of every
    data section named under ``[train] datasections`` in ``train.cfg`` are
    then concatenated into ``expdir/trainfeats`` and ``expdir/traintasks``.

    Args:
        expdir: experiment directory (path object supporting ``/``). It is
            created with ``os.makedirs`` without ``exist_ok``, so an
            existing directory raises ``FileExistsError``.
        recipe: recipe directory holding the configuration files and
            ``database.cfg``.
    """
    os.makedirs(expdir)

    for filename in ("acquisition.cfg", "coder.cfg", "train.cfg",
                     "structure.xml"):
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir)

    trainconf = read_config(expdir / "train.cfg")
    dataconf = read_config(recipe / "database.cfg")

    # Lines that do not already start with the section name get it as a
    # prefix; experiments whose path mentions "fluent" join without "_".
    separator = "" if "fluent" in str(expdir).lower() else "_"

    def convert_uttid(spkr, uttid):
        if uttid.startswith(spkr):
            return uttid
        return spkr + separator + uttid

    logger.debug("Create trainfeats and traintasks files")
    nfeats = ntasks = 0
    with open(expdir / "trainfeats", "w") as feats, \
            open(expdir / "traintasks", "w") as tasks:
        for section in trainconf.get("train", "datasections").split():
            with open(Path(dataconf.get(section, "features")) / "feats") as f:
                lines = [convert_uttid(section, line) for line in f]
            feats.writelines(lines)
            nfeats += len(lines)

            with open(Path(dataconf.get(section, "tasks"))) as f:
                lines = [convert_uttid(section, line) for line in f]
            tasks.writelines(lines)
            ntasks += len(lines)

    # Counted while writing instead of shelling out to `wc -l`, which broke
    # on paths containing whitespace and is unavailable on some platforms.
    logger.info(f"Written {nfeats} features and {ntasks} tasks to {expdir}")
コード例 #2
0
ファイル: finetune_slu.py プロジェクト: qmeeus/assist
    def train(self, train_set, valid_set, test_set=None):
        """Run the training loop and persist the metric history.

        Each epoch trains on ``train_set``, evaluates on ``valid_set`` and
        calls ``on_epoch_end``. Training stops early when the configured
        early-stopping object says so. The per-epoch history is written to
        ``history.json`` in the checkpoint directory; when ``test_set`` is
        given, its accumulated metrics are written to ``results.json``.

        Args:
            train_set: dataset passed to ``train_one_epoch``.
            valid_set: dataset passed to ``evaluate`` after every epoch.
            test_set: optional dataset evaluated once after training.
        """
        self.initialize()

        epoch_bar = trange(self.max_epochs, desc="epochs")
        logger.debug(
            f"Training for {self.max_epochs} epochs on {len(train_set)} examples"
        )
        history = defaultdict(list)
        for self._epoch in epoch_bar:
            log = self.train_one_epoch(train_set)
            log = self.evaluate(valid_set, log)
            # Record this epoch's value of each metric (previously the whole
            # log dict was appended under every key).
            for key in log:
                history[key].append(log[key])

            self.on_epoch_end(log, progress_bar=epoch_bar)
            if self.early_stopping and self.early_stopping.should_stop(
                    self._epoch):
                logger.info("Early stopping reached")
                break
            # Only plateau schedulers are stepped here, driven by val_loss.
            for scheduler in self.schedulers:
                if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(log["val_loss"])

        with open(f"{self.checkpoint_dir}/history.json", "w") as f:
            json.dump(history, f)

        if test_set is not None:
            log = self.evaluate(test_set)
            with open(f"{self.checkpoint_dir}/results.json", "w") as f:
                json.dump(self.accumulate_metrics(log), f, indent=4)
コード例 #3
0
    def train_loop(self):
        """Run one pass over the training loader.

        Returns:
            dict with ``"loss"`` (summed batch losses divided by
            ``len(train_set)``) and ``"error_rate"`` (as computed by
            ``compute_error_rate`` on all targets and predictions).
        """
        train_loss = 0
        self.encoder.train()
        self.decoder.train()
        # NOTE(review): apart from the two calls above, this body reads
        # `train_lr`, `train_set`, `encoder`, `decoder`, the optimizers and
        # `device` as free names rather than attributes of self - confirm
        # they are available in the enclosing scope.
        pred_chunks, label_chunks = [], []
        for inputs, input_lengths, labels in tqdm(train_lr, total=len(train_set)//train_lr.batch_size):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            inputs = inputs.to(device)
            labels = labels.to(device)
            encodings, encoding_lengths = encoder.encode(inputs, input_lengths)
            loss, preds = decoder(encodings, encoding_lengths, labels=labels)
            loss.backward()
            logger.debug(f"loss={loss.item()}")
            encoder_optimizer.step()
            decoder_optimizer.step()
            train_loss += loss.item()
            # Collect per-batch arrays and concatenate once after the loop;
            # concatenating inside the loop re-copies everything each step.
            pred_chunks.append(preds.detach().cpu().numpy())
            label_chunks.append(labels.detach().cpu().numpy())

        # Keep the original "no batches -> None" result.
        predictions = (np.concatenate(pred_chunks, axis=0)
                       if pred_chunks else None)
        target = (np.concatenate(label_chunks, axis=0)
                  if label_chunks else None)

        train_loss = float(train_loss/len(train_set))
        train_error_rate = compute_error_rate(target, predictions)
        return {"loss": train_loss, "error_rate": train_error_rate}
コード例 #4
0
def encode_feats(expdir, featconf, dataconf, aggfunc=None):
    """Encode one speaker's text lines with a pretrained model and save them.

    The speaker is ``expdir.name``; only lines of the text file that start
    with that name are encoded. Each utterance's output is cut to its number
    of attended tokens, optionally reduced with ``aggfunc``, and handed to
    ``save_features``.

    Args:
        expdir: speaker directory; only its ``name`` is used.
        featconf: config object read through ``get("features", ...)`` for
            the keys ``storage``, ``encoder``, ``file`` and ``agg``.
        dataconf: mapping providing the ``"features"`` target path.
        aggfunc: optional callable applied to every utterance array.
    """
    speaker = expdir.name
    target_file = Path(dataconf["features"])
    storage_type = featconf.get("features", "storage")
    model_string = featconf.get("features", "encoder")

    # Imported lazily so the heavy dependencies load only when needed.
    import torch
    from transformers import AutoTokenizer, AutoModel

    logger.info(f"Loading tokenizer and model: {model_string}")
    tokenizer = AutoTokenizer.from_pretrained(model_string)
    encoder = AutoModel.from_pretrained(model_string)
    encoder.eval()

    # HACK: Small workaround but would be better to use --cuda flag
    if os.environ.get("CUDA_VISIBLE_DEVICES", None):
        device_name = "cuda"
    else:
        device_name = "cpu"
    device = torch.device(device_name)

    logger.debug(f"Device: {device}")
    encoder.to(device)

    for weight in encoder.parameters():
        weight.requires_grad = False

    with open(featconf.get("features", "file"), "r") as textfile:
        parsed = [
            parse_line(row) for row in textfile.readlines()
            if row.startswith(speaker)
        ]
    # Transpose the (uttid, text) pairs into two parallel lists.
    uttids, texts = (list(column) for column in zip(*parsed))

    inputs = tokenizer(texts,
                       add_special_tokens=False,
                       return_tensors="pt",
                       padding=True)

    # Number of attended (non-padding) positions per utterance.
    input_lengths = inputs["attention_mask"].sum(-1)

    model_inputs = {}
    for key, value in inputs.items():
        if isinstance(value, torch.Tensor):
            value = value.to(device)
        model_inputs[key] = value
    out = encoder(**model_inputs)

    if len(out) == 2:
        # The boolean indexes the pair: False -> first, True -> second.
        use_pooled = featconf.get("features", "agg") == "pooled"
        data = out[use_pooled]
    else:
        data, = out

    # NOTE(review): the `[:length]` slice is applied to the second axis
    # whichever output was picked above - confirm this is intended for the
    # "pooled" case.
    per_utt = {}
    for row, (uttid, length) in enumerate(zip(uttids, input_lengths)):
        per_utt[uttid] = data[row, :length].cpu().numpy()

    if aggfunc is not None:
        per_utt = {uttid: aggfunc(array) for uttid, array in per_utt.items()}

    save_features(per_utt, target_file, storage_type)
コード例 #5
0
ファイル: train.py プロジェクト: qmeeus/assist
def prepare_subset(expdir, subset, dataconf):
    """Write ``{subset}feats`` and ``{subset}tasks`` listings into ``expdir``.

    For every data section named under ``[subset] datasections`` in
    ``expdir/{subset}.cfg``, the ``.scp`` file next to the section's feature
    file and the section's task file are parsed with ``parse_line`` and
    appended to the two output files as ``"<uttid> <value>"`` lines.

    Args:
        expdir: experiment directory (path object supporting ``/``).
        subset: subset name, e.g. ``"train"``; selects the config file, the
            config section and the output file names.
        dataconf: database config queried with ``get(section, key)`` for the
            ``features`` and ``tasks`` paths.
    """
    conf = read_config(expdir / f"{subset}.cfg")

    logger.debug(f"Create {subset}feats and {subset}tasks files")
    written = {"feats": 0, "tasks": 0}
    with open(expdir / f"{subset}feats", "w") as feats, \
            open(expdir / f"{subset}tasks", "w") as tasks:
        for section in conf.get(subset, "datasections").split():
            featfile, taskfile = (Path(dataconf.get(section, key))
                                  for key in ("features", "tasks"))
            # with_suffix only touches the final extension; the previous
            # str.replace rewrote every occurrence of the suffix in the path
            # and misbehaved when the file had no suffix at all.
            scpfile = featfile.with_suffix(".scp")
            for kind, filepath, outfile in (("feats", scpfile, feats),
                                            ("tasks", taskfile, tasks)):
                with open(filepath) as f:
                    entries = [parse_line(line) for line in f]
                if not entries:
                    # An empty listing used to crash when unpacking zip().
                    logger.warning(f"No entries found in {filepath}")
                    continue

                uttids, values = zip(*entries)
                if not uttids[0].startswith(section):
                    # Make sure that uttid has format $speaker_$uttid
                    uttids = [f"{section}_{uttid}" for uttid in uttids]

                outfile.writelines(
                    f"{uttid} {value}\n"
                    for uttid, value in zip(uttids, values))
                written[kind] += len(uttids)

    # Counted while writing instead of shelling out to `wc -l`, which broke
    # on paths containing whitespace.
    nfeats, ntasks = written["feats"], written["tasks"]
    logger.info(
        f"Written {nfeats} features and {ntasks} tasks to {expdir} ({subset})")
コード例 #6
0
ファイル: database.py プロジェクト: qmeeus/assist
def run_prepare_database(expdir,
                         recipe,
                         backend="local",
                         njobs=-1,
                         overwrite=False):
    """Create ``expdir`` and prepare one directory per database section.

    Args:
        expdir: directory to create (``os.makedirs`` without ``exist_ok``).
        recipe: recipe directory containing ``database.cfg`` and
            ``features.cfg``.
        backend: ``"condor"`` submits the feature preparation through
            ``condor_submit``; any other value runs it with ``mp_map``.
        njobs: forwarded to ``mp_map`` for the feature preparation.
        overwrite: accepted but not used by this function.
    """
    logger.debug(f"Create {expdir}")
    os.makedirs(expdir)

    dataconf = read_config(recipe / "database.cfg")
    shutil.copy(recipe / "features.cfg", expdir / "features.cfg")

    speakers = dataconf.sections()
    speaker_count = len(speakers)
    speaker_options = [dict(dataconf[name].items()) for name in speakers]

    # One job per speaker for the directory set-up step.
    mp_map(map_prepare_spkrdir, [expdir] * speaker_count,
           speakers, speaker_options,
           njobs=speaker_count)

    spkrdirs = [expdir / name for name in speakers]

    if backend != "condor":
        mp_map(map_prepare_features, spkrdirs, njobs=njobs)
        return

    condor_submit(expdir, "prepare_dataset", spkrdirs)
コード例 #7
0
ファイル: classifier.py プロジェクト: qmeeus/assist
 def train(self, examples, test_examples=None):
     """Encode the task targets and fit the model.

     Args:
         examples: mapping whose values are ``(features, task)`` pairs.
         test_examples: optional mapping with the same layout; when given,
             its features and encoded targets are passed to ``fit`` as a
             third ``(features, targets)`` argument.
     """
     logger.debug(f"{len(examples)} training examples")
     features, tasks = zip(*examples.values())
     target = [self.encode_target(task) for task in tasks]
     if test_examples is None:
         self.fit(features, target)
         return

     # Unpack the held-out data from test_examples (previously the training
     # examples were unpacked here, so "test" data was the training data).
     test_feats, test_tasks = zip(*test_examples.values())
     test_target = [self.encode_target(task) for task in test_tasks]
     self.fit(features, target, (test_feats, test_target))
コード例 #8
0
 def train_one_step(self, inputs, labels, optimizer, lr_scheduler=None):
     """Perform a single optimisation step and return the loss as a float.

     Args:
         inputs: iterable of model inputs, unpacked positionally.
         labels: targets, passed to the model as ``labels=``.
         optimizer: object providing ``zero_grad()`` and ``step()``.
         lr_scheduler: optional scheduler; when given it is stepped (and
             its learning rate logged) before the optimisation step.
     """
     has_scheduler = lr_scheduler is not None
     if has_scheduler:
         lr_scheduler.step()
         logger.debug(f"LR: {lr_scheduler.get_lr()}")

     optimizer.zero_grad()

     # `_recursive_to` returns the inputs followed by the labels.
     moved = self._recursive_to(*inputs, labels)
     *model_inputs, targets = moved

     batch_loss, _ = self.model(*model_inputs, labels=targets)
     batch_loss.backward()
     optimizer.step()

     return batch_loss.item()
コード例 #9
0
def prepare_gridsearch(expdir, recipe):
    """Set up ``expdir`` for a grid search.

    Creates the directory if needed, copies the grid-search configuration
    files from ``recipe`` and builds the ``gridsearch`` data listings with
    ``prepare_subset``.

    Args:
        expdir: experiment directory (path object supporting ``/``).
        recipe: recipe directory providing the files and ``database.cfg``.
    """
    os.makedirs(expdir, exist_ok=True)

    required_files = ("param_grid.json", "gridsearch.cfg", "coder.cfg",
                      "structure.xml")
    for filename in required_files:
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        source = recipe / filename
        destination = expdir / filename
        shutil.copy(source, destination)

    prepare_subset(expdir, "gridsearch", read_config(recipe / "database.cfg"))
コード例 #10
0
def build_encoder(model_dir, freeze=None):
    """Load a trained model and optionally freeze some of its sub-modules.

    Args:
        model_dir: directory passed to ``load_args``; the ``resume``
            attribute of the loaded options is given to
            ``load_trained_model``.
        freeze: optional iterable of attribute names on the model whose
            parameters get ``requires_grad = False``.

    Returns:
        The loaded model, with ``teacher_model`` set to ``None``.
    """
    options = load_args(model_dir)
    model, train_args = load_trained_model(options.resume)
    model.teacher_model = None

    # A falsy `freeze` (None, empty) means nothing is frozen.
    for module_name in freeze or ():
        logger.info(f"Freeze {module_name} in encoder")
        submodule = getattr(model, module_name)
        for weight in submodule.parameters():
            weight.requires_grad = False

    display_model(model, logger.info)
    logger.debug(train_args)
    return model
コード例 #11
0
ファイル: train.py プロジェクト: qmeeus/assist
def prepare_train(expdir, recipe):
    """Create ``expdir`` and prepare the train and test data listings.

    Args:
        expdir: experiment directory (path object supporting ``/``);
            created with ``os.makedirs`` without ``exist_ok``.
        recipe: recipe directory providing the configuration files and
            ``database.cfg``.
    """
    os.makedirs(expdir)

    config_files = [
        "acquisition.cfg", "coder.cfg", "train.cfg", "test.cfg",
        "structure.xml"
    ]
    for filename in config_files:
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir / filename)

    dataconf = read_config(recipe / "database.cfg")

    prepare_subset(expdir, "train", dataconf)
    prepare_subset(expdir, "test", dataconf)
コード例 #12
0
ファイル: finetune_slu.py プロジェクト: qmeeus/assist
 def on_epoch_end(self, log, progress_bar=None):
     """Report the epoch metrics and save checkpoints when due.

     Args:
         log: raw metric log of the epoch; passed through
             ``accumulate_metrics`` before reporting.
         progress_bar: optional progress bar; when truthy the metrics are
             shown as its postfix, otherwise they are logged at debug level.

     Returns:
         The accumulated metrics.
     """
     log = self.accumulate_metrics(log)
     if progress_bar:
         progress_bar.set_postfix({k: f"{v:.4f}" for k, v in log.items()})
     else:
         logger.debug(f"[{self._epoch+1}] " +
                      " ".join(f"[{k}={v:.4f}]" for k, v in log.items()))
     # Parenthesised on purpose: `%` binds tighter than `+`, so the former
     # `self._epoch + 1 % self.save_interval` never matched the interval.
     if (self._epoch + 1) % self.save_interval == 0:
         self.save_checkpoint(f"{self._epoch:04d}")
     # The early-stopping callable returning a truthy value triggers saving
     # the "best" checkpoint.
     if self.early_stopping and self.early_stopping(self._epoch, log,
                                                    self.model):
         self.save_checkpoint("best")
     return log
コード例 #13
0
ファイル: classifier.py プロジェクト: qmeeus/assist
    def __init__(
        self,
        config=None,
        coder=None,
        expdir=None
    ):
        """Store the configuration and build the model.

        Args:
            config: a ``ConfigParser`` (its ``acquisition`` section is
                converted to a dict), a mapping used as-is, or ``None`` for
                an empty configuration.
            coder: object providing ``numlabels``; used for the number of
                classes when the config has no truthy ``output_dim``.
            expdir: experiment directory, stored on the instance.
        """
        if isinstance(config, ConfigParser):
            self.config = dict(config["acquisition"].items())
        elif config is None:
            # The signature advertises config=None; use an empty mapping so
            # the lookup below does not fail on None.
            self.config = {}
        else:
            self.config = config

        self.coder = coder
        self.expdir = expdir
        self.n_classes = self.config.get("output_dim", None) or coder.numlabels  # Backward compat
        self.model = self.build()
        logger.debug(f"Num classes: {self.n_classes}")
コード例 #14
0
def prepare_cross_validation(expdir, recipe):
    """Prepare the experiment directory for per-speaker cross-validation.

    Copies the configuration files from ``recipe``, seeds ``random``, builds
    the coder and calls ``map_prepare_filesystem`` once per selected speaker.

    Args:
        expdir: experiment directory (path object supporting ``/``).
        recipe: recipe directory providing the configuration files.
    """
    os.makedirs(expdir, exist_ok=True)
    for filename in ["acquisition.cfg", "coder.cfg", "structure.xml"]:
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir / filename)

    default_conf = Path(__file__).parent / "defaults/cross_validation.cfg"
    cv_config = read_config(recipe / "cross_validation.cfg",
                            default=default_conf)
    expconf = dict(cv_config.items("cross_validation"))

    random_seed = int(expconf.get("random_seed", 3105))
    logger.debug(f"Setting random seed to {random_seed}")
    random.seed(random_seed)

    dataconf = read_config(recipe / "database.cfg")
    coderconf = read_config(expdir / "coder.cfg")

    structure = Structure(expdir / 'structure.xml')
    coder_class = coder_factory(coderconf.get("coder", "name"))
    coder = coder_class(structure, coderconf)

    speakers = list(dataconf.sections())
    if expconf.get("speakers"):
        # NOTE(review): `in` is evaluated against the raw config value; if
        # that is a string this is a substring match, so "pp1" would also
        # be selected by "pp10" - confirm this is intended.
        speakers = [
            spkr for spkr in speakers if spkr in expconf["speakers"]
        ]

    logger.info(f"{len(speakers)} speakers selected for cross-validation")

    for speaker in speakers:
        map_prepare_filesystem(
            dict(expdir=expdir,
                 speaker=speaker,
                 coder=coder,
                 dataconf=dataconf,
                 expconf=expconf))
コード例 #15
0
ファイル: finetune_slu.py プロジェクト: qmeeus/assist
 def evaluate(self, dataset, log=None):
     """Evaluate the model on ``dataset`` and accumulate metrics into ``log``.

     Args:
         dataset: sized dataset handed to ``make_dataloader``.
         log: optional mapping to accumulate into; a falsy value is
             replaced by a fresh ``defaultdict(int)``.

     Returns:
         The log, with per-batch metrics summed and ``"val_loss"`` divided
         by ``len(dataset)``.
     """
     batch_size = 32
     self.model.eval()
     log = log or defaultdict(int)
     valid_lr = self.make_dataloader(dataset, batch_size, train=False)
     # Ceiling division: a trailing partial batch still counts as one
     # iteration, which floor division left out of the progress-bar total.
     num_batches = (len(dataset) + batch_size - 1) // batch_size
     iter_bar = tqdm(valid_lr,
                     desc="eval",
                     total=num_batches,
                     leave=False)
     logger.debug(f"Evaluate {len(dataset)} examples")
     for batch in iter_bar:
         val_loss, metrics = self.eval_one_step(*batch)
         log["val_loss"] += float(val_loss)
         for metric, value in metrics.items():
             log[metric] += value
         iter_bar.set_postfix(
             dict(val_loss=f"{val_loss/batch_size:.3f}",
                  **{
                      k: f"{v:.3f}"
                      for k, v in self.accumulate_metrics(metrics).items()
                  }))
     log["val_loss"] /= len(dataset)
     return log
コード例 #16
0
ファイル: mlp.py プロジェクト: qmeeus/assist
    def train_loop(self, dataset):
        """Train the model for ``self.max_iter`` optimisation steps.

        ``max_iter`` is ``self.iterations`` when that is positive (and
        ``self.epochs`` is then reset to 0); otherwise it is derived from
        ``self.epochs``, the dataset size and the batch size.

        Args:
            dataset: sized dataset handed to ``self.batch_iterator``.
        """
        if self.iterations > 0:
            self.max_iter = self.iterations
            self.epochs = 0
        else:
            self.max_iter = int(
                np.ceil(self.epochs * len(dataset) / self.batch_size))
        assert any([self.epochs, self.iterations])
        logger.debug(f"Number of iterations: {self.iterations}")

        optimizer = torch.optim.Adam(self.model.parameters())

        progress_bar = tqdm(
            total=self.max_iter,
            bar_format=
            "{postfix[1][iter]}/{postfix[0]} loss={postfix[1][loss]:.4f}",
            postfix=[self.max_iter, {
                "iter": 0,
                "loss": float('inf')
            }])

        with progress_bar:
            # Build the batch iterator once and consume it batch by batch.
            # It used to be rebuilt on every step, so only the first batch
            # of each fresh iterator was ever used.
            train_iter = iter(self.batch_iterator(dataset, is_train=True))
            iteration = 0
            while True:
                try:
                    batch = next(train_iter)
                except StopIteration:
                    # Dataset exhausted: start another pass. An empty
                    # iterator still raises StopIteration here.
                    train_iter = iter(
                        self.batch_iterator(dataset, is_train=True))
                    batch = next(train_iter)
                iteration += 1
                *inputs, labels = batch
                train_loss = self.train_one_step(inputs, labels,
                                                 optimizer) / len(labels)
                progress_bar.postfix[1].update({
                    "iter": iteration,
                    "loss": train_loss
                })
                progress_bar.update()
                if iteration == self.max_iter:
                    break
コード例 #17
0
def gs_learning_curve(expdir, recipe, cuda=True, n_jobs=1):
    """Grid-search model parameters by the area under the learning curve.

    Every combination of the values in ``recipe/param_grid.json`` is used to
    build an ``RNNClassifier``; ``learning_curve`` is run on it and the
    combination with the highest AUC of the mean validation scores is kept.
    All results are written to ``expdir/gs_results.json``.

    Args:
        expdir: experiment directory with ``gridsearch.cfg``, ``coder.cfg``,
            ``structure.xml``, ``gridsearchfeats`` and ``gridsearchtasks``.
        recipe: recipe directory containing ``param_grid.json``.
        cuda: selects ``"cuda"`` or ``"cpu"`` as the configured device.
        n_jobs: forwarded to ``learning_curve``.
    """
    logger.info(f"GridSearch {expdir}")

    with open(recipe / "param_grid.json") as jsonfile:
        param_grid = json.load(jsonfile)

    logger.debug(str(param_grid))
    total_params = np.prod(list(map(len, param_grid.values())))
    logger.warning(
        f"Searching {len(param_grid)} parameters, totalling {total_params} possible values."
    )

    gsconf = read_config(expdir / "gridsearch.cfg")
    default_config = dict(gsconf["acquisition"].items())
    default_config["device"] = "cuda" if cuda else "cpu"
    gsconf = dict(gsconf["gridsearch"].items())
    logger.debug(" ".join(f"{k}={v}" for k, v in gsconf.items()))
    train_sizes = np.linspace(float(gsconf["nmin"]), float(gsconf["nmax"]),
                              int(gsconf["num_trains"]))
    gs_params = {
        "train_sizes":
        train_sizes,
        "cv":
        int(gsconf["cv_splits"]),
        "scoring":
        make_scorer(accuracy)
        if gsconf["scoring"] == "accuracy" else gsconf["scoring"],
        "n_jobs":
        n_jobs
    }
    logger.debug(gs_params)

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    default_config["output_dim"] = coder.numlabels

    features = FeatLoader(expdir / "gridsearchfeats").to_dict()
    with open(expdir / "gridsearchtasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    # Keep only utterances that have both features and a task, in a
    # deterministic (sorted) order.
    indices = sorted(set(features).intersection(set(taskstrings)))
    X = list(map(features.__getitem__, indices))
    y = list(
        map(coder.encode, map(read_task, map(taskstrings.__getitem__,
                                             indices))))

    gs_results = defaultdict(list)
    start = time()
    # Start from -inf with no selection so the first finite score is always
    # picked; starting at 0 left best_params/best_index unbound whenever no
    # score was strictly positive.
    best_params, best_index = None, None
    best_score = float("-inf")
    for i, param_values in enumerate(product(*param_grid.values())):

        t0 = time()
        params = dict(zip(param_grid.keys(), param_values))
        config = deepcopy(default_config)
        config.update(params)
        logger.debug(config)

        model = RNNClassifier(**config)

        train_sizes, train_scores, valid_scores = learning_curve(
            model, X, y, **gs_params)

        train_score = auc(train_sizes, train_scores.mean(-1))
        test_score = auc(train_sizes, valid_scores.mean(-1))
        t1 = time()
        logger.info(
            f"model {i+1}/{total_params}: train={train_score:.3%} test={test_score:.3%} "
            f"time={t1 - t0:.1f}s elapsed={t1-start:.1f}s {params}")
        gs_results["auc_test_score"].append(test_score)
        gs_results["auc_train_score"].append(train_score)
        gs_results["params"].append(params)
        gs_results["train_sizes"].append(train_sizes)
        gs_results["train_scores"].append(train_scores)
        gs_results["test_scores"].append(valid_scores)

        if test_score > best_score:
            best_params, best_score, best_index = params, test_score, i

    if best_index is None:
        # Empty grid, or no comparable score (e.g. all NaN).
        logger.warning(
            f"Search completed in {time() - start:.2f}s. No model could be selected."
        )
        best_score = None
    else:
        logger.warning(
            f"Search completed in {time() - start:.2f}s. Best model: {best_params} ({best_score:.2%})"
        )
        logger.warning(
            f"Test scores: {gs_results['test_scores'][best_index].mean(-1)}")

    with open(expdir / "gs_results.json", "w") as result_file:
        json.dump(
            {
                "best_params": best_params,
                "best_score": best_score,
                "cv_results": serialise(gs_results)
            }, result_file)
コード例 #18
0
def main(expdir, cuda):
    """Decode the test utterances of one experiment and write its scores.

    Does nothing when ``expdir/f1`` already exists. Otherwise the model is
    loaded from ``expdir/model``, utterances with non-finite features are
    dropped, the rest are decoded, and the decoded tasks, one file per
    metric and the detailed scores are written into ``expdir``.

    Args:
        expdir: experiment directory (anything accepted by ``Path``).
        cuda: selects ``"cuda"`` or ``"cpu"`` as the acquisition device.
    """
    expdir = Path(expdir)
    if (expdir / "f1").exists():
        logger.info(f"Results found at {expdir}")
        return

    logger.info(f"Evaluate {expdir}")

    acquisitionconf = tools.read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = tools.read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")

    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    logger.debug(f"Loading model at {expdir}/model")
    model.load(expdir / 'model')

    with open(expdir / "testfeats") as testfeats:
        features = {
            line[0]: np.load(line[1])
            for line in map(tools.parse_line, testfeats.readlines())
        }

    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(tools.parse_line, testtasks.readlines())
            if key in features
        }

    assert len(features) == len(references)

    # Drop utterances whose features contain NaN/inf. Iterate over a
    # snapshot of the items so entries can be deleted safely; this replaces
    # a deep copy of every feature array. The "too small" filter is
    # disabled, so its counter stays at 0 in the summary message.
    total = len(features)
    errors, nans, too_small = 0, 0, 0
    for uttid, feat in list(features.items()):
        if not np.isfinite(feat).all():
            nans += 1
            logger.debug(f"Removing {uttid}")
            errors += 1
            del features[uttid]
            del references[uttid]

    if errors > 0:
        logger.warning(
            f"{errors}/{total} utts removed ({too_small} too small and {nans} contained NaN)"
        )

    decoded = model.decode(features)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name}  {to_string(task)}\n" for name, task in decoded.items()])

    # Output file names are derived from these strings (spaces removed), so
    # the spelling is kept as-is.
    metric_names = [
        "precision", "recal", "f1", "macro precision", "macro recal",
        "macro f1"
    ]
    metrics, scores = score(decoded, references)

    for metric_name, metric in zip(metric_names, metrics):
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)
コード例 #19
0
def train(
    encoder, decoder,
    train_set, valid_set,
    enc_ckpt, dec_ckpt,
    max_epochs=20,
    batch_size=64,
    enc_lr=1e-5, dec_lr=1e-2,
    es_patience=5,
    es_criterion="val_loss",
    es_lower_is_better=True,
    device="cuda",
    enc_update_interval=1
):
    """Jointly train an encoder and a decoder with early stopping.

    Each epoch runs one pass over ``train_set`` and one over ``valid_set``.
    Whenever the monitored validation metric improves, both modules are
    saved with ``torch.save``. Training stops once ``max_epochs`` is reached
    or more than ``es_patience`` epochs have passed since the best epoch.

    Args:
        encoder, decoder: modules to train; the encoder must provide
            ``encode(inputs, input_lengths)`` and the decoder must return
            ``(loss, predictions)`` when called with ``labels=``.
        train_set, valid_set: sized datasets exposing ``data_collator``.
        enc_ckpt, dec_ckpt: checkpoint destinations for ``torch.save``.
        max_epochs: upper bound on the number of epochs.
        batch_size: batch size of both data loaders.
        enc_lr, dec_lr: Adam learning rates of the two optimizers.
        es_patience: epochs without improvement tolerated before stopping.
        es_criterion: ``"val_loss"`` monitors the validation loss; any
            other value monitors the validation error rate.
        es_lower_is_better: whether a smaller monitored value is better.
        device: device the modules and batches are moved to.
        enc_update_interval: the encoder optimizer steps only on batches
            whose 1-based index is a multiple of this value.
    """
    encoder.to(device)
    decoder.to(device)

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=enc_lr)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=dec_lr)

    # Start from the worst possible value for the chosen direction. With
    # +inf unconditionally, a "higher is better" criterion could never
    # improve, so no checkpoint was ever written.
    worst = float("inf") if es_lower_is_better else float("-inf")
    best_error_rate = worst
    best_loss = worst
    best_epoch = 0
    epoch = 0
    t0 = time()

    def should_stop(epoch):
        return (epoch - best_epoch) > es_patience or epoch >= max_epochs

    def check_early_stopping(epoch, loss, error_rate):
        nonlocal best_error_rate, best_loss, best_epoch
        if es_criterion == "val_loss":
            metric, prev = loss, best_loss
        else:
            metric, prev = error_rate, best_error_rate

        # Positive product means the metric moved in the desired direction.
        if (metric - prev) * (-1 if es_lower_is_better else 1) > 0:
            best_epoch = epoch
            best_loss = loss
            best_error_rate = error_rate
            torch.save(encoder, enc_ckpt)
            torch.save(decoder, dec_ckpt)

    def merge(chunks):
        # One concatenation per epoch; None when there were no batches.
        return np.concatenate(chunks, axis=0) if chunks else None

    # The loaders do not change between epochs, so build them once; the
    # shuffling loader draws a new order every time it is iterated.
    train_lr = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_set.data_collator
    )
    val_lr = DataLoader(
        valid_set,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=valid_set.data_collator
    )

    while True:
        train_loss, val_loss = 0, 0
        encoder.train()
        decoder.train()
        pred_chunks, label_chunks = [], []
        for i, (inputs, input_lengths, labels) in enumerate(tqdm(train_lr, total=len(train_set)//train_lr.batch_size), 1):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            inputs = inputs.to(device)
            labels = labels.to(device)
            encodings, encoding_lengths = encoder.encode(inputs, input_lengths)
            loss, preds = decoder(encodings, encoding_lengths, labels=labels)
            loss.backward()
            logger.debug(f"loss={loss.item()}")
            # NOTE(review): encoder gradients are zeroed on every batch, so
            # skipped batches do not accumulate into the next encoder step -
            # confirm that is intended.
            if i % enc_update_interval == 0:
                encoder_optimizer.step()
            decoder_optimizer.step()
            train_loss += loss.item()
            pred_chunks.append(preds.detach().cpu().numpy())
            label_chunks.append(labels.detach().cpu().numpy())

        train_loss = float(train_loss/len(train_set))
        train_error_rate = compute_error_rate(
            merge(label_chunks), merge(pred_chunks))

        pred_chunks, label_chunks = [], []
        with torch.no_grad():
            encoder.eval()
            decoder.eval()
            for inputs, input_lengths, labels in val_lr:
                inputs = inputs.to(device)
                labels = labels.to(device)
                encodings, encoding_lengths = encoder.encode(inputs, input_lengths)
                loss, preds = decoder(encodings, encoding_lengths, labels=labels)
                val_loss += loss.item()
                pred_chunks.append(preds.detach().cpu().numpy())
                label_chunks.append(labels.detach().cpu().numpy())

        val_loss = float(val_loss/len(valid_set))
        val_error_rate = compute_error_rate(
            merge(label_chunks), merge(pred_chunks))

        epoch += 1
        logger.info(f"Epoch {epoch} [ TRAIN {train_loss:.4f} {train_error_rate:.4f} ] "
                    f"[ VALID {val_loss:.4f} {val_error_rate:.4f} ]")

        check_early_stopping(epoch, val_loss, val_error_rate)

        if should_stop(epoch):
            break

    logger.info(f"Training completed in {time() - t0:.2f} s. "
                f"Best epoch: {best_epoch} Loss: {best_loss:.3f} ER: {best_error_rate:.4%}")
コード例 #20
0
def prepare_filesystem(expdir, speaker, coder, dataconf, expconf):
    """Create the per-speaker experiment directories for cross-validation.

    The speaker's utterances are split into blocks (loaded from
    ``blocks.pkl`` when present, otherwise computed with ``make_blocks`` and
    cached). For each number of training blocks and each repetition, a
    random selection of blocks forms the training set and all remaining
    utterances the test set. Sub-experiment directories receive symlinks to
    the shared configuration and their ``{train,test}{feats,tasks}`` files.

    Args:
        expdir: experiment root (path object supporting ``/``).
        speaker: speaker / database section name.
        coder: object whose ``encode`` turns a parsed task into a label
            vector.
        dataconf: database config queried with ``get(speaker, key)``.
        expconf: mapping providing ``numexp``, ``startblocks``, ``scale``
            and ``increment`` (and whatever ``make_blocks`` reads).

    Returns:
        ``[]`` when the speaker has to be skipped (no tasks, block creation
        failed, or no splits); ``None`` otherwise.
    """
    speaker_dir = expdir / speaker
    os.makedirs(speaker_dir, exist_ok=True)

    feature_file = Path(dataconf.get(speaker, 'features'))
    # with_suffix only swaps the final extension; str.replace rewrote every
    # occurrence of the suffix anywhere in the path.
    with open(feature_file.with_suffix(".scp")) as featfile:
        features = dict(map(parse_line, featfile.readlines()))

    with open(dataconf.get(speaker, "tasks")) as taskfile:
        task_strings = {
            f"{speaker}_{uttid}": task
            for uttid, task in map(parse_line, taskfile.readlines())
        }

    # Drop tasks for which no feature entry exists.
    for uttid in list(task_strings):
        if uttid not in features:
            logger.warning(f"Missing utterance speaker {speaker}: {uttid}")
            del task_strings[uttid]

    tasks = [
        coder.encode(read_task(task)) for task in task_strings.values()
    ]

    if not tasks:
        logger.error(f"Error with speaker {speaker}: no tasks")
        return []

    tasks = np.array(tasks)
    blocks_path = speaker_dir / "blocks.pkl"
    if blocks_path.exists():
        with open(blocks_path, "rb") as blockfile:
            blocks = pickle.load(blockfile)
    else:
        try:
            blocks = make_blocks(tasks, expconf, feature_file.parent)
        except Exception as err:
            logger.error(f"Error with speaker {speaker}: {err}")
            return []
        with open(blocks_path, "wb") as blockfile:
            pickle.dump(blocks, blockfile)

    num_exp = int(expconf["numexp"])

    # train_ids[b][e] / test_ids[b][e]: utterance indices of repetition `e`
    # when training on `b + 1` randomly chosen blocks.
    train_ids, test_ids = [], []
    for block_id in range(len(blocks) - 1):
        train_ids.append([])
        test_ids.append([])
        for exp_id in range(num_exp):
            selected = list(
                itertools.chain.from_iterable(
                    random.sample(blocks, block_id + 1)))
            train_ids[-1].append(selected)
            # Set membership keeps the complement linear in the number of
            # utterances instead of quadratic.
            selected_set = set(selected)
            test_ids[-1].append(
                [i for i in range(len(tasks)) if i not in selected_set])

    if not (train_ids and test_ids):
        logger.error(f"Error with speaker {speaker}: no utterances")
        return []

    uttids = list(task_strings)
    block_id = int(expconf['startblocks']) - 1
    while True:

        dirname = f"{block_id + 1}blocks_exp"
        for exp_id in range(num_exp):

            subexpdir = expdir / speaker / (dirname + str(exp_id))
            logger.debug(f"Experiment {subexpdir.name}")

            # A result file means this sub-experiment already finished.
            if (subexpdir / "f1").exists():
                logger.info(f"Skipping {subexpdir}")
                continue

            os.makedirs(subexpdir, exist_ok=True)

            for filename in ("acquisition.cfg", "coder.cfg", "structure.xml"):
                symlink(f"../../{filename}",
                        subexpdir / filename,
                        relative=True)

            if not (subexpdir / "trainfeats").exists():
                for subset, ids in [("train", train_ids), ("test", test_ids)]:
                    utts = [
                        uttids[idx] for idx in ids[block_id][exp_id]
                        if idx < len(uttids)
                    ]
                    if len(utts) != len(ids[block_id][exp_id]):
                        num_lost = len(ids[block_id][exp_id]) - len(utts)
                        logger.warning(f"Lost {num_lost} {subset} utterances")
                    logger.debug(f"Number of {subset} examples: {len(utts):,}")

                    writefile(subexpdir / f"{subset}feats",
                              {utt: features[utt]
                               for utt in utts})
                    writefile(subexpdir / f"{subset}tasks",
                              {utt: task_strings[utt]
                               for utt in utts})

        # Advance by scale/increment, capped at the largest available
        # training size; stop once the cap has been processed.
        next_block_id = (block_id + 1) * int(expconf['scale']) + int(
            expconf['increment']) - 1
        next_block_id = min(next_block_id, len(blocks) - 2)
        if block_id == next_block_id:
            break
        block_id = next_block_id
コード例 #21
0
ファイル: make_blocks.py プロジェクト: qmeeus/assist
def _kl_to_target(counts, target):
    '''
    computes the Kullback-Leibler divergence of the distribution obtained by
    normalising a vector of label counts with respect to a target
    distribution

    args:
        counts: the label counts as a vector with one entry per label
        target: the target distribution as a vector with one entry per label

    returns:
        - the divergence as a scalar
    '''

    dist = counts / np.sum(counts)
    # labels that do not occur contribute nothing and would give log(0)
    present = np.nonzero(dist)
    return np.sum(dist[present] * np.log(dist[present] / target[present]))


def make_blocks(labelmat, conf, blocksdir):
    '''
    divides the data into blocks of similar content by minimising the
    Jensen-Shannon divergence

    The blocks start from a random split of the utterances. If
    conf["balancedblocks"] is "true", the single move of one utterance to
    another block with the highest gain is applied repeatedly for as long as
    it lowers the summed Kullback-Leibler divergence between the label
    distribution of the blocks and the overall label distribution. If
    conf["alllabels"] is "true", the number of blocks is decremented until
    every counted label occurs in every block. The result is cached in
    blocksdir as "<nblocks>blocks.pkl"; an existing cache file is returned
    as is.

    args:
        labelmat: the label matrix of shape [numutt x numlabels]
        conf: the experiments configuration, the keys "numblocks",
            "minblocks", "alllabels" and "balancedblocks" are read
        blocksdir: the directory where blocks are stored

    returns:
        - the data blocks as a list containing lists of utterance indices

    raises:
        ValueError: if less than conf["minblocks"] blocks can be created
    '''

    req_blocks = int(conf["numblocks"])
    min_blocks = int(conf["minblocks"])
    all_labels = conf['alllabels'].lower() == "true"
    balanced_blocks = conf['balancedblocks'].lower() == 'true'
    blocksdir = Path(blocksdir)
    os.makedirs(blocksdir, exist_ok=True)

    numutt = labelmat.shape[0]

    # initialise numblocks as the requested number of blocks
    nblocks = min(req_blocks, numutt)
    labelmat = labelmat.astype(int)

    # the labels that have to occur in every block, only used if all_labels
    to_count = None
    if all_labels:
        # ignore labels that have less than the minimum amount of labels
        to_count, = np.where(labelmat.sum(0) >= min_blocks)
        # cap the number of blocks by the count of the rarest kept label
        nblocks = int(min(nblocks, np.min(labelmat[:, to_count].sum(0))))

    while True:

        # check if the minimum number of blocks has been reached
        if nblocks < min_blocks:
            raise ValueError(f'Failed to create {min_blocks} blocks')

        blocksfile = blocksdir / f'{nblocks}blocks.pkl'
        if os.path.exists(blocksfile):
            # NOTE: unpickling is only safe because this file is a cache
            # written by this function, never point blocksdir at untrusted
            # data
            with open(blocksfile, 'rb') as fid:
                return pickle.load(fid)

        # compute the average distribution of labels
        Tdist = np.sum(labelmat, 0) / np.sum(labelmat)

        # make a random initialisation for the blocks
        ind = list(np.random.permutation(range(numutt)))
        blocks = [
            ind[int(i * numutt / nblocks):int((i + 1) * numutt / nblocks)]
            for i in range(nblocks)
        ]

        # compute the label counts and the initial KLD to the mean for all
        # blocks
        clab = [np.sum(labelmat[block, :], 0) for block in blocks]
        KLD = [_kl_to_target(counts, Tdist) for counts in clab]

        # compute the gains for removing an utterance from its block and for
        # adding it to each of the other blocks
        remove_gains = np.zeros(numutt)
        swap_gains = np.zeros([numutt, nblocks])
        for b1 in range(nblocks):
            for u in blocks[b1]:
                remove_gains[u] = KLD[b1] - _kl_to_target(
                    clab[b1] - labelmat[u, :], Tdist)
                for b2 in range(nblocks):
                    if b1 != b2:
                        swap_gains[u, b2] = KLD[b2] - _kl_to_target(
                            clab[b2] + labelmat[u, :], Tdist)

        while True:
            # compute the complete gains for all the moves
            gains = remove_gains[:, np.newaxis] + swap_gains
            # remove the elements where utterances stay in the same block
            for b in range(nblocks):
                gains[blocks[b], b] = 0

            # find the best move: utterance uc goes to block bt
            I = np.argmax(gains)
            uc = I // nblocks
            bt = I % nblocks

            # stop when no move improves the blocks or when balancing is
            # disabled
            if not (gains[uc, bt] > 0 and balanced_blocks):
                break

            # find the originating block
            bo = [uc in b for b in blocks].index(True)

            # apply the change
            blocks[bt].append(uc)
            blocks[bo].remove(uc)

            # update the counts and the costs for the relevant blocks
            clab[bo] = clab[bo] - labelmat[uc, :]
            clab[bt] = clab[bt] + labelmat[uc, :]
            KLD[bo] = _kl_to_target(clab[bo], Tdist)
            KLD[bt] = _kl_to_target(clab[bt], Tdist)

            # update the remove gains for the utterances in the relevant
            # blocks
            for b in [bo, bt]:
                for u in blocks[b]:
                    remove_gains[u] = KLD[b] - _kl_to_target(
                        clab[b] - labelmat[u, :], Tdist)

            # update the swap gains for all the utterances to the relevant
            # blocks
            swap_gains[uc, bt] = 0
            for b1 in range(nblocks):
                for b2 in [bt, bo]:
                    if b1 != b2:
                        for u in blocks[b1]:
                            swap_gains[u, b2] = KLD[b2] - _kl_to_target(
                                clab[b2] + labelmat[u, :], Tdist)

        # there are no more changes with gain, check if all labels occur in
        # all blocks; all_labels is tested first because to_count only holds
        # label indices when it is set
        if not all_labels or not any(
                (counts[to_count] == 0).any() for counts in clab):

            with open(blocksfile, 'wb') as fid:
                pickle.dump(blocks, fid)

            break

        # if there are blocks that don't have all labels decrement the number
        # of blocks and start over
        nblocks -= 1

    logger.debug(f"Created {nblocks} blocks (requested: {req_blocks})")
    return blocks
コード例 #22
0
ファイル: finetune_slu.py プロジェクト: qmeeus/assist
 def save_checkpoint(self, suffix):
     """Save ``self.model`` to ``ckpt-<suffix>.pt`` in the checkpoint dir.

     The whole model object is handed to ``torch.save``; ``suffix`` is
     interpolated into the file name as given.
     """
     filename = f"ckpt-{suffix}.pt"
     path = f"{self.checkpoint_dir}/{filename}"
     logger.debug(f"Saving model to {path}")
     torch.save(self.model, path)