Example 1
def load_data(options):

    structure = Structure(options.configdir / "structure.xml")
    coderconf = read_config(options.configdir / "coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)

    dataconf = read_config(options.configdir / "database.cfg")

    features = {
        uttid: feats
        for spkr in dataconf.sections()
        for uttid, feats in np.load(dataconf[spkr].get("features")).items()
    }

    labels = {
        uttid: coder.encode(read_task(task))
        for spkr in dataconf.sections()
        for uttid, task in zip(*load_tasks(Path(dataconf[spkr].get("tasks"))))
    }

    # keys present in only one of the two dicts (symmetric difference)
    errors = set(features) ^ set(labels)
    if errors:
        msg = f"{len(errors)} mismatches ({len(features)} features and {len(labels)} labels)"
        if options.errors == "raise":
            raise Exception(msg)
        elif options.errors == "warn":
            warnings.warn(msg)

    uttids = list(features)
    features = [features[uttid] for uttid in uttids]
    labels = [labels[uttid] for uttid in uttids]
    return SequenceDataset(features, labels)
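
The consistency check above is just the symmetric difference of the feature and label key sets; a tiny self-contained demonstration with made-up utterance ids:

# union minus intersection == symmetric difference (^)
features = {"spk1_utt1": 0, "spk1_utt2": 0}
labels = {"spk1_utt1": 0, "spk1_utt3": 0}
mismatches = set(features) ^ set(labels)
assert mismatches == (set(features) | set(labels)) - (set(features) & set(labels))
assert mismatches == {"spk1_utt2", "spk1_utt3"}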
Example 2
def train(expdir, cuda=False, do_eval=True):
    logger.info(f"Train {expdir}")

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)

    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    model.display(logger.info)

    trainfeats = FeatLoader(expdir / "trainfeats").to_dict()

    with open(expdir / "traintasks") as f:
        traintasks = dict(map(parse_line, f))

    train_set = {
        utt: (trainfeats[utt], traintasks[utt])
        for utt in traintasks if utt in trainfeats
    }

    if not train_set:
        raise ValueError("No training examples")

    test_feats = FeatLoader(expdir / "testfeats").to_dict()

    with open(expdir / "testtasks") as testtasks:
        test_tasks = dict(map(parse_line, testtasks))

    test_set = {
        utt: (test_feats[utt], test_tasks[utt])
        for utt in test_tasks if utt in test_feats
    }

    if (expdir / "model").exists():
        model.load(expdir / "model")

    model.train(train_set, test_set)
    model.save(expdir / 'model')

    if do_eval:
        evaluate(expdir, cuda=cuda)
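
Several of these examples rely on a `parse_line` helper whose implementation is not shown. Its call sites (e.g. `dict(map(parse_line, f))`) imply that it maps a whitespace-separated "uttid payload" line to a pair; a hedged reconstruction:

def parse_line(line):
    # Inferred from the call sites, not the actual implementation:
    # "uttid rest-of-line" -> ("uttid", "rest-of-line")
    uttid, payload = line.strip().split(maxsplit=1)
    return uttid, payload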
Example 3
def main(expdir, cuda):

    expdir = Path(expdir)

    #check if this experiment has been completed
    if (expdir / "model").exists():
        logger.warning(f"Found trained model in {expdir}.")
        return

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)

    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)

    trainconf = dict(
        read_config(expdir / "train.cfg",
                    default=Path(__file__).parent /
                    "defaults/train.cfg").items("train"))

    with open(expdir / "trainfeats") as trainfeats:
        features = {
            uttid: np.load(featsfile)
            for uttid, featsfile in map(parse_line, trainfeats.readlines())
        }

    with open(expdir / "traintasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    examples = {
        utt: (features[utt], taskstrings[utt])
        for utt in taskstrings if utt in features
    }
    model.train(examples)
    model.save(expdir / 'model')
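
`read_config` is called here with a `default` fallback file. Its implementation is not shown; a plausible configparser-based sketch that matches the call sites (read the defaults first, then let the experiment file override them):

from configparser import ConfigParser

def read_config(path, default=None):
    # hypothetical reconstruction of the helper used throughout these examples
    config = ConfigParser()
    if default is not None:
        config.read(default)
    config.read(path)
    return config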
Example 4
def prepare_cross_validation(expdir, recipe):

    os.makedirs(expdir, exist_ok=True)
    for filename in ("acquisition.cfg", "coder.cfg", "structure.xml"):
        logger.debug(f"Copy {filename} from {recipe} to {expdir}")
        shutil.copy(recipe / filename, expdir / filename)

    expconf = dict(
        read_config(recipe / "cross_validation.cfg",
                    default=Path(__file__).parent /
                    "defaults/cross_validation.cfg").items("cross_validation"))

    random_seed = int(expconf.get("random_seed", 3105))
    logger.debug(f"Setting random seed to {random_seed}")
    random.seed(random_seed)

    dataconf = read_config(recipe / "database.cfg")
    coderconf = read_config(expdir / "coder.cfg")

    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)

    speakers = list(dataconf.sections())
    if expconf.get("speakers"):
        speakers = [spkr for spkr in speakers if spkr in expconf["speakers"]]

    logger.info(f"{len(speakers)} speakers selected for cross-validation")

    option_list = [
        dict(expdir=expdir,
             speaker=speaker,
             coder=coder,
             dataconf=dataconf,
             expconf=expconf) for speaker in speakers
    ]

    for opts in option_list:
        map_prepare_filesystem(opts)
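
This example and the previous one both turn a config section into a plain dict with `dict(cfg.items(section))`. A quick standard-library demonstration (configparser values are always strings, hence the `int(...)` conversions above):

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read_string("[cross_validation]\nrandom_seed = 3105\nspeakers = spk1 spk2\n")
expconf = dict(cfg.items("cross_validation"))
assert expconf["random_seed"] == "3105"  # still a string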
Example 5
def gs_learning_curve(expdir, recipe, cuda=True, n_jobs=1):
    logger.info(f"GridSearch {expdir}")

    with open(recipe / "param_grid.json") as jsonfile:
        param_grid = json.load(jsonfile)

    logger.debug(str(param_grid))
    total_params = np.prod(list(map(len, param_grid.values())))
    logger.warning(
        f"Searching {len(param_grid)} parameters, totalling {total_params} possible values."
    )

    gsconf = read_config(expdir / "gridsearch.cfg")
    default_config = dict(gsconf["acquisition"].items())
    default_config["device"] = "cuda" if cuda else "cpu"
    gsconf = dict(gsconf["gridsearch"].items())
    logger.debug(" ".join(f"{k}={v}" for k, v in gsconf.items()))
    train_sizes = np.linspace(float(gsconf["nmin"]), float(gsconf["nmax"]),
                              int(gsconf["num_trains"]))
    gs_params = {
        "train_sizes": train_sizes,
        "cv": int(gsconf["cv_splits"]),
        "scoring": (make_scorer(accuracy)
                    if gsconf["scoring"] == "accuracy" else gsconf["scoring"]),
        "n_jobs": n_jobs,
    }
    logger.debug(gs_params)

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    default_config["output_dim"] = coder.numlabels

    features = FeatLoader(expdir / "gridsearchfeats").to_dict()
    with open(expdir / "gridsearchtasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    indices = sorted(set(features).intersection(set(taskstrings)))
    X = [features[uttid] for uttid in indices]
    y = [coder.encode(read_task(taskstrings[uttid])) for uttid in indices]

    gs_results = defaultdict(list)
    start = time()
    best_params, best_score, best_index = None, 0, -1  # avoid unbound names if nothing scores above 0
    for i, param_values in enumerate(product(*param_grid.values())):

        t0 = time()
        params = dict(zip(param_grid.keys(), param_values))
        config = deepcopy(default_config)
        config.update(params)
        logger.debug(config)

        model = RNNClassifier(**config)

        train_sizes, train_scores, valid_scores = learning_curve(
            model, X, y, **gs_params)

        train_score = auc(train_sizes, train_scores.mean(-1))
        test_score = auc(train_sizes, valid_scores.mean(-1))
        t1 = time()
        logger.info(
            f"model {i+1}/{total_params}: train={train_score:.3%} test={test_score:.3%} "
            f"time={t1 - t0:.1f}s elapsed={t1-start:.1f}s {params}")
        gs_results["auc_test_score"].append(test_score)
        gs_results["auc_train_score"].append(train_score)
        gs_results["params"].append(params)
        gs_results["train_sizes"].append(train_sizes)
        gs_results["train_scores"].append(train_scores)
        gs_results["test_scores"].append(valid_scores)

        if test_score > best_score:
            best_params, best_score, best_index = params, test_score, i

    logger.warning(
        f"Search completed in {time() - start:.2f}s. Best model: {best_params} ({best_score:.2%})"
    )
    logger.warning(
        f"Test scores: {gs_results['test_scores'][best_index].mean(-1)}")

    with open(expdir / "gs_results.json", "w") as result_file:
        json.dump(
            {
                "best_params": best_params,
                "best_score": best_score,
                "cv_results": serialise(gs_results)
            }, result_file)
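
The grid search ranks each configuration by the area under its learning curve: scikit-learn's `auc(x, y)` computes a trapezoidal integral, so models that score well with little training data are rewarded. A toy illustration with invented scores:

import numpy as np
from sklearn.metrics import auc

train_sizes = np.linspace(0.1, 1.0, 5)
# rows: train sizes, columns: CV folds (made-up numbers)
valid_scores = np.array([[0.50, 0.54], [0.62, 0.66], [0.70, 0.72],
                         [0.76, 0.80], [0.81, 0.85]])
score = auc(train_sizes, valid_scores.mean(-1))  # one scalar per model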
Example 6
def load_data(options):

    logger.info("Loading data")
    structure = Structure(options.expdir/"structure.xml")
    coderconf = read_config(options.expdir/"coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    featconf = read_config(options.expdir/"features.cfg")
    dataconf = read_config(options.expdir/"database.cfg")

    def load_tasks(filename):
        with open(filename) as f:
            spkr = filename.stem

            if spkr == "tasks":
                spkr = filename.parent.name

            uttids, tasks = map(list, zip(*(
                line.strip().split(maxsplit=1) for line in f)))

        return list(map(f"{spkr}_{{}}".format, uttids)), tasks

    feat_loader = kaldiio.load_scp(featconf["features"].get("fbanks"))
    features = {uttid: feat_loader[uttid] for uttid in feat_loader}

    labels = {
        uttid: coder.encode(read_task(task))
        for spkr in dataconf.sections()
        for uttid, task in zip(*load_tasks(Path(dataconf[spkr].get("tasks"))))
    }

    errors = set(features) ^ set(labels)  # symmetric difference
    if errors:
        msg = f"{len(errors)} mismatches ({len(features)} features and {len(labels)} labels)"
        if options.errors == "raise":
            raise Exception(msg)
        elif options.errors == "warn":
            logger.warning(msg)

    uttids = np.array(list(features))
    features = np.array([features[uttid] for uttid in uttids], dtype="object")
    labels = np.array([labels[uttid] for uttid in uttids])

    if any(subset in uttids[0] for subset in ["train", "valid", "test"]):
        train_mask = np.array(list(map(lambda s: "_train_" in s, uttids)))
        valid_mask = np.array(list(map(lambda s: "_valid_" in s, uttids)))
        # test_mask = np.array(list(map(lambda s: "_test_" in s, uttids)))

    elif (options.expdir/"train.cfg").exists():
        train_sections = set(read_config(options.expdir/"train.cfg").get("train", "datasections").split())
        test_sections = set(read_config(options.expdir/"test.cfg").get("test", "datasections").split())
        train_mask = np.array(list(map(lambda s: s.split("_")[0] in train_sections, uttids)))
        valid_mask = np.array(list(map(lambda s: s.split("_")[0] in test_sections, uttids)))

    else:
        train_ids, valid_ids = train_test_split(uttids, test_size=0.1)
        train_mask = np.array(list(map(lambda s: s in train_ids, uttids)))
        valid_mask = np.array(list(map(lambda s: s in valid_ids, uttids)))

    train_set = SequenceDataset(features[train_mask], labels[train_mask], uttids[train_mask])
    valid_set = SequenceDataset(features[valid_mask], labels[valid_mask], uttids[valid_mask])
    # test_set = SequenceDataset(features[test_mask], labels[test_mask], uttids[test_mask])

    return train_set, valid_set
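
The first branch above assumes Fluent Speech Commands utterance ids embed their subset name (for example "spk1_train_00001"; the exact id format is an assumption here):

import numpy as np

uttids = np.array(["spk1_train_00001", "spk1_valid_00001", "spk1_test_00001"])
train_mask = np.array(["_train_" in u for u in uttids])
assert uttids[train_mask].tolist() == ["spk1_train_00001"]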
Example 7
def load_data(options):

    if options.expdir is None:
        options.expdir = options.outdir

    dataconf = read_config(options.expdir / "database.cfg")
    structure = Structure(options.expdir / "structure.xml")
    coderconf = read_config(options.expdir / "coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)

    if options.expdir == options.outdir:
        trainfeats = FeatLoader(options.outdir / "trainfeats").to_dict()
        testfeats = FeatLoader(options.outdir / "testfeats").to_dict()
        with open(options.outdir / "traintasks") as traintasks:
            trainlabels = {
                uttid: coder.encode(read_task(task))
                for uttid, task in map(parse_line, traintasks.readlines())
            }

        with open(options.outdir / "testtasks") as testtasks:
            testlabels = {
                uttid: coder.encode(read_task(task))
                for uttid, task in map(parse_line, testtasks.readlines())
            }

        features = set(trainfeats).union(set(testfeats))
        labels = set(trainlabels).union(set(testlabels))

    else:

        def load_tasks(filename):
            with open(filename) as f:
                spkr = filename.stem

                if spkr == "tasks":
                    spkr = filename.parent.name

                uttids, tasks = map(list, zip(*(
                    line.strip().split(maxsplit=1) for line in f)))

            if not uttids[0].startswith(spkr):
                uttids = list(map(f"{spkr}_{{}}".format, uttids))
            return uttids, tasks

        features = {
            uttid: feats
            for spkr in dataconf.sections() for uttid, feats in np.load(
                dataconf[spkr].get("features")).items()
        }

        labels = {
            uttid: coder.encode(read_task(task))
            for spkr in dataconf.sections() for uttid, task in zip(
                *load_tasks(Path(dataconf[spkr].get("tasks"))))
        }

    errors = set(features) ^ set(labels)  # symmetric difference
    if errors:
        msg = f"{len(errors)} mismatches ({len(features)} features and {len(labels)} labels)"
        if options.errors == "raise":
            raise Exception(msg)
        elif options.errors == "warn":
            warnings.warn(msg)
        else:
            features = {k: v for k, v in features.items() if k not in errors}
            labels = {k: v for k, v in labels.items() if k not in errors}
            if not (features and labels):
                raise ValueError("No examples left after removing errors")

    if options.expdir == options.outdir:
        trainuttids = list(trainfeats)  # fixed order so feats and labels stay aligned
        trainfeats = np.array([trainfeats[uttid] for uttid in trainuttids],
                              dtype="object")
        trainlabels = np.array([trainlabels[uttid] for uttid in trainuttids])
        train_set = SequenceDataset(trainfeats, trainlabels)

        testuttids = list(testfeats)
        testfeats = np.array([testfeats[uttid] for uttid in testuttids],
                             dtype="object")
        testlabels = np.array([testlabels[uttid] for uttid in testuttids])
        valid_set = SequenceDataset(testfeats, testlabels)

        return train_set, valid_set

    uttids = np.array(list(features))
    features = np.array([features[uttid] for uttid in uttids], dtype="object")
    labels = np.array([labels[uttid] for uttid in uttids])

    if options.method == "10-fold":
        return SequenceDataset(features, labels, indices=uttids)

    # 1. Fluent Speech Commands
    if any(subset in uttids[0] for subset in ["train", "valid", "test"]):
        logger.info("Fluent Speech Commands dataset splits")
        train_mask = np.array(list(map(lambda s: "_train_" in s, uttids)))
        valid_mask = np.array(list(map(lambda s: "_valid_" in s, uttids)))
        # test_mask = np.array(list(map(lambda s: "_test_" in s, uttids)))

    # 2. Train/test split exists in expdir
    elif (options.expdir / "train.cfg").exists():
        logger.info(
            f"Loading dataset splits from spec {options.expdir}/{{train,test}}.cfg"
        )
        train_sections = set(
            read_config(options.expdir / "train.cfg").get(
                "train", "datasections").split())
        test_sections = set(
            read_config(options.expdir / "test.cfg").get(
                "test", "datasections").split())

        def make_filter(sections):
            def _filter(uttid):
                return any(uttid.startswith(spkr) for spkr in sections)

            return _filter

        train_mask = np.array(list(map(make_filter(train_sections), uttids)))
        valid_mask = np.array(list(map(make_filter(test_sections), uttids)))

    # 3. Random train/test split
    else:
        logger.info("Random train/test split")
        train_ids, valid_ids = train_test_split(uttids, test_size=0.1)
        train_mask = np.array(list(map(lambda s: s in train_ids, uttids)))
        valid_mask = np.array(list(map(lambda s: s in valid_ids, uttids)))

    if options.method in ("10%", "1%"):
        sz = .1 if options.method == "10%" else .01
        train_ids = np.arange(len(features))[train_mask]
        train_ids, _ = train_test_split(train_ids,
                                        train_size=sz,
                                        stratify=labels[train_mask])
        train_ids = set(train_ids)
        train_mask = [idx in train_ids for idx in np.arange(len(features))]

    train_set = SequenceDataset(features[train_mask], labels[train_mask])
    valid_set = SequenceDataset(features[valid_mask], labels[valid_mask])
    # test_set = SequenceDataset(features[test_mask], labels[test_mask])

    logger.info(
        f"Dataset loaded: train_size={len(train_set):,} valid_size={len(valid_set):,}"
    )


    with open(options.outdir/"trainfeats") as featfile, \
            open(options.outdir/"traintasks") as taskfile:
        for uttid in uttids[train_mask]:
            speaker = "_".join(uttid.split("_")[:-1])
            featpath = dataconf.get("speaker")["feats"]
            taskstring = coder.decode(labels[uttid])
            featfile.write(f"{uttid} {featpath}:{uttid}\n")
            taskfile.write(f"{uttid} {taskstring}")

    return train_set, valid_set
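
The "10%"/"1%" branch subsamples the training mask while preserving the label distribution via scikit-learn's `stratify` argument. A minimal sketch with toy labels:

import numpy as np
from sklearn.model_selection import train_test_split

labels = np.array([0] * 90 + [1] * 10)
indices = np.arange(len(labels))
subset, _ = train_test_split(indices, train_size=0.1, stratify=labels)
# ~10 indices with roughly the same 9:1 class ratio as the full set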
Example 8
def load_structure_and_coder(expdir):
    structure = Structure(expdir / "structure.xml")
    coderconf = read_config(expdir / "coder.cfg")
    Coder = coder_factory(coderconf.get("coder", "name"))
    coder = Coder(structure, coderconf)
    return structure, coder
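
Usage sketch: the helper factors out the structure/coder boilerplate repeated across the other examples (the experiment path is hypothetical):

structure, coder = load_structure_and_coder(Path("exp/my_experiment"))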
Example 9

@dataclass
class Task:
    name: str
    args: dict


# Setup
confdir = Path("config/FluentSpeechCommands/lstm_128")
# outdir = Path("exp/fluent/finetune_gru")
outdir = Path("exp/fluent/finetune_gru_enc_upd_2")
dataconf = read_config(confdir/"database.cfg")
coderconf = read_config(confdir/"coder.cfg")
structure = Structure(confdir/"structure.xml")
Coder = coder_factory(coderconf.get("coder", "name"))
coder = Coder(structure, coderconf)

# Model
encoder = torch.load(outdir/"encoder.pt", map_location="cuda")
decoder = torch.load(outdir/"decoder.pt", map_location="cuda")
for module in (encoder, decoder):
    for line in str(module).split("\n"):
        logger.info(line)
    for p in module.parameters():
        p.requires_grad = False

# Target
speakers = list(dataconf.sections())
taskfiles = [dataconf[spkr].get("tasks") for spkr in speakers]
# load_tasks returns (uttids, tasks), as in the earlier examples
taskstrings = {
    uttid: task
    for taskfile in taskfiles
    for uttid, task in zip(*load_tasks(Path(taskfile)))
}
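
The freezing loop above can also be written with `torch.nn.Module.requires_grad_`, which flips the flag on all parameters recursively:

for module in (encoder, decoder):
    module.requires_grad_(False)  # same effect as the explicit parameter loop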
Example 10
def evaluate(expdir, cuda=False, clean=False):
    logger.info(f"Evaluate {expdir}")

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)

    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    model.display(logger.info)
    model.load(expdir / 'model')
    model.display(logger.info)

    features = FeatLoader(expdir / "testfeats").to_dict()

    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(parse_line, testtasks.readlines())
            if key in features
        }

    assert len(features) == len(references), set(features) - set(references)

    to_remove = []
    for uttid, feat in features.items():
        if not np.isfinite(feat).all():
            to_remove.append(uttid)

    if to_remove:
        logger.warning(f"Found {len(to_remove)} utterances with nan.")
        for uttid in to_remove:
            del features[uttid]
            del references[uttid]

    decoded = model.decode(features)

    assert not (set(decoded) - set(references))
    y_true = np.array([coder.encode(task) for task in references.values()])
    y_pred = np.array([coder.encode(task) for task in decoded.values()])

    TP = (y_pred == 1) & (y_true == 1)
    TN = (y_pred == 0) & (y_true == 0)
    FP = (y_pred == 1) & (y_true == 0)
    FN = (y_pred == 0) & (y_true == 1)
    error_rate = 1 - (TP | TN).all(-1).mean()
    precision = TP.sum() / (TP | FP).sum()
    recall = TP.sum() / (TP | FN).sum()
    f1_score = 2 * precision * recall / (precision + recall)
    logger.info(f"TPR={TP.sum()} TN={TN.sum()} FP={FP.sum()} FN={FN.sum()}")
    logger.info(
        f"P={precision:.2%} R={recall:.2%} F={f1_score:.2%} E={error_rate:.2%}"
    )

    for line in classification_report(y_true, y_pred).split("\n"):
        logger.info(line)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name} {to_string(task)}\n" for name, task in decoded.items()])

    metrics, scores = score(decoded, references)
    for metric_name, metric in metrics.items():
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)

    if clean:
        logger.info(f"Remove {expdir}/model")
        os.remove(expdir / 'model')
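
On multilabel arrays the hand-rolled counts above reproduce scikit-learn's micro-averaged precision and recall; a small sanity check with toy labels:

import numpy as np
from sklearn.metrics import precision_score, recall_score

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 1]])
TP = (y_pred == 1) & (y_true == 1)
FP = (y_pred == 1) & (y_true == 0)
FN = (y_pred == 0) & (y_true == 1)
assert np.isclose(TP.sum() / (TP | FP).sum(),
                  precision_score(y_true, y_pred, average="micro"))
assert np.isclose(TP.sum() / (TP | FN).sum(),
                  recall_score(y_true, y_pred, average="micro"))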
Example 11
def main(expdir, cuda):

    expdir = Path(expdir)
    if (expdir / "f1").exists():
        logger.info(f"Results found at {expdir}")
        return

    logger.info(f"Evaluate {expdir}")

    acquisitionconf = tools.read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = tools.read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")

    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    logger.debug(f"Loading model at {expdir}/model")
    model.load(expdir / 'model')

    with open(expdir / "testfeats") as testfeats:
        features = {
            line[0]: np.load(line[1])
            for line in map(tools.parse_line, testfeats.readlines())
        }

    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(tools.parse_line, testtasks.readlines())
            if key in features
        }

    assert len(features) == len(references)

    # decode the test utterances, dropping any with non-finite features first
    feats = deepcopy(features)
    removed = 0
    for uttid, feat in feats.items():
        if not np.isfinite(feat).all():
            logger.debug(f"Removing {uttid}")
            removed += 1
            del features[uttid]
            del references[uttid]

    if removed > 0:
        logger.warning(f"{removed}/{len(feats)} utts removed (non-finite features)")

    decoded = model.decode(features)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name}  {to_string(task)}\n" for name, task in decoded.items()])

    metric_names = [
        "precision", "recall", "f1", "macro precision", "macro recall",
        "macro f1"
    ]
    metrics, scores = score(decoded, references)

    for metric_name, metric in zip(metric_names, metrics):
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)