def test_split_corpus():
    """Exercise the corpus splitter end-to-end on a small synthetic corpus."""
    exemplar_count = 10
    base_dir = "tests/split-corpus"

    source_dir = pjoin(base_dir, "orig")
    splits_dir = pjoin(base_dir, "splits")
    train_dir = pjoin(splits_dir, "train")
    dev_dir = pjoin(splits_dir, "dev")

    setup_test_corpus(source_dir, train_dir, dev_dir, exemplar_count)
    before = corpus({"location": source_dir})
    split_corpus(
        source_dir,
        split_dir=splits_dir,
        split_name="dev",
        split_words=19,
        min_split_segs=1,
        leftover_data_split_name="train",
        rand_seed=1337,
    )

    # The splitter must leave the input corpus untouched.
    after = corpus({"location": source_dir})
    assert before.validate() == 1
    assert after.validate() == 1
    hashes_after = [ex.hash() for ex in after.exemplars]
    for ex in before.exemplars:
        assert ex.hash() in hashes_after

    # The carved-out dev split must hold the expected word total.
    dev_split = corpus({"location": dev_dir})
    assert sum(ex.count_words() for ex in dev_split.exemplars) == 20
    assert dev_split.validate()
# Ejemplo n.º 2 — 0  (scrape artifact; commented out so the file parses)
def split_corpus(
    in_dir,
    split_dir,
    split_name="split",
    split_words=1000,
    min_split_segs=10,
    leftover_data_split_name="orig",
    rand_seed=None,
):
    """
    Split an ASR corpus directory by word count, writing splits into split_dir.

    At least 1000 words is recommended for dev or test splits to make WER
    calculations significant (~0.1%). Invalid files, such as empty files,
    will not be included in data splits.

    Args:
        in_dir: source corpus directory; it is read but not modified.
        split_dir: directory the named splits are written under.
        split_name: name of the carved-out split (e.g. "dev").
        split_words: target number of words for the carved-out split.
        min_split_segs: minimum number of valid segments required.
        leftover_data_split_name: split name that receives remaining data.
        rand_seed: set for reproducible splits; None seeds from system entropy.

    Exits the process with status 1 when the corpus has fewer valid
    segments than min_split_segs.
    """
    seed(rand_seed)

    c = corpus({"location": in_dir})
    LOGGER.debug("%d exemplars before validating them", len(c.exemplars))
    valid_exemplars, total_words = c.count_exemplar_words()
    c.exemplars = valid_exemplars
    LOGGER.debug("%d exemplars after validating them", len(valid_exemplars))

    # Hoisted: previously calculate_number_of_segments() was computed twice
    # (once for the comparison, once for the log message).
    n_segments = c.calculate_number_of_segments()
    if min_split_segs > n_segments:
        LOGGER.error(
            "Not enough valid segments in corpus, %d, to make a split with %d segments. Reduce min_split_segs or get more data",
            n_segments,
            min_split_segs,
        )
        sys.exit(1)

    perform_split(c, split_dir, split_name, split_words, min_split_segs,
                  leftover_data_split_name)
def main():
    """CLI entry point: copy and organize the named corpora into a target
    directory, creating train/test/dev sets automatically when absent.

    NOTE(review): relies on a module-level ``data_dirs`` (presumably
    ["dev", "test", "train"], given the dict below) — confirm it is
    defined elsewhere in this module.
    """
    parser = argparse.ArgumentParser(
        description=
        'Copy and organize specified corpora into a target directory. Training, testing, and development sets will be created automatically if not already defined.'
    )
    parser.add_argument('--target-dir',
                        default='input-data',
                        required=False,
                        help="Path to target directory")
    parser.add_argument(
        'corpora',
        nargs='+',
        help="Name of one or more directories in directory this script is run")

    args = parser.parse_args()

    # Create one output subdirectory per data split under the target dir.
    for data_dir in data_dirs:
        os.makedirs(args.target_dir + "/" + data_dir, exist_ok=True)

    corpora = {
        "dev": corpus(),
        "test": corpus(),
        "train": corpus(),
        "unsorted": corpus()
    }

    # Merge each input directory's pre-sorted splits (if present), and also
    # sweep its top level into the "unsorted" bucket.
    for corpus_dir in args.corpora:
        for data_dir in data_dirs:
            if os.path.exists(corpus_dir + "/" + data_dir):
                corpora[data_dir] += corpus(
                    {"location": corpus_dir + "/" + data_dir})
        corpora['unsorted'] += corpus({"location": corpus_dir})

    # validate() is used as a readiness/non-emptiness check here
    # (the test above asserts it returns 1 for a valid corpus).
    all_ready = all(corpora[data_dir].validate() for data_dir in data_dirs)
    # dump extra data into training data by default
    corpora['train'] += corpora['unsorted']
    if not all_ready:
        print(
            "Not all training corpora were prepared. Automatically shuffling into training, testing, development sets"
        )

        # first pass, populate train directory
        if not corpora['train'].validate():
            corpora['train'] += corpora['dev'] + corpora['test']

        # pick a file from training set to be dev set
        # NOTE(review): assumes corpus objects support [-1] / [:-1]
        # indexing and slicing — confirm against the corpus class.
        if not corpora['dev'].validate():
            corpora['dev'] = corpora['train'][-1]
            corpora['train'] = corpora['train'][:-1]

        # pick 20% for testing
        # NOTE(review): validate() appears to return 1 for a valid corpus,
        # which would make split_index 0 and move ALL remaining training
        # data into the test set — this looks like it was meant to use the
        # corpus length/word count instead. TODO confirm and fix.
        if not corpora['test'].validate():
            split_index = int(corpora['train'].validate() * 4 // 5)
            corpora['test'] = corpora['train'][split_index:]
            corpora['train'] = corpora['train'][:split_index]

    # prepare for training
    for data_dir in data_dirs:
        corpora[data_dir].prepare_for_training(args.target_dir + "/" +
                                               data_dir)
# Ejemplo n.º 4 — 0  (scrape artifact; commented out so the file parses)
def gather_all_corpora(corpora_dirs):
    """
    Find all existing corpora and gather them into a dictionary keyed by
    data split name (per module-level ``data_dirs``), plus an "unsorted"
    entry holding each corpus directory's top level.

    Bug fix: the previous dict comprehension keyed on data_dir while also
    iterating corpus_dir, so with more than one corpus directory each split
    silently kept only the LAST directory's corpus. Splits are now
    accumulated across all corpus directories with ``+=``, matching how
    main() merges corpora.
    """
    # Start each split empty, then merge in every corpus directory.
    corpora = {data_dir: corpus() for data_dir in data_dirs}
    for corpus_dir in corpora_dirs:
        for data_dir in data_dirs:
            corpora[data_dir] += get_corpus(corpus_dir + "/" + data_dir)

    # Sweep each directory's top level into the "unsorted" bucket
    # (no need to materialize an intermediate list via map/list).
    corpora["unsorted"] = corpus()
    for corpus_dir in corpora_dirs:
        corpora["unsorted"] += get_corpus(corpus_dir)
    return corpora
# Ejemplo n.º 5 — 0  (scrape artifact; commented out so the file parses)
def get_corpus(loc):
    """Build and return a corpus rooted at the given location."""
    spec = {"location": loc}
    return corpus(spec)