def test_split_corpus():
    """Exercise split_corpus end-to-end on a synthetic corpus.

    Builds a small throwaway corpus, carves a ~19-word "dev" split out of
    it, then checks that (a) the source corpus is left untouched and
    (b) the dev split validates and holds the expected word count.
    """
    exemplar_count = 10
    corpus_dir = "tests/split-corpus"
    orig_dir = pjoin(corpus_dir, "orig")
    split_dir = pjoin(corpus_dir, "splits")
    trn_dir = pjoin(split_dir, "train")
    dev_dir = pjoin(split_dir, "dev")
    setup_test_corpus(orig_dir, trn_dir, dev_dir, exemplar_count)
    before = corpus({"location": orig_dir})

    split_corpus(
        orig_dir,
        split_dir=split_dir,
        split_name="dev",
        split_words=19,
        min_split_segs=1,
        leftover_data_split_name="train",
        rand_seed=1337,
    )

    # Make sure we didn't destroy input data: the original location must
    # still validate and contain every exemplar it had before the split.
    after = corpus({"location": orig_dir})
    assert before.validate() == 1
    assert after.validate() == 1
    hashes_before = [ex.hash() for ex in before.exemplars]
    hashes_after = [ex.hash() for ex in after.exemplars]
    assert all(h in hashes_after for h in hashes_before)

    # Make sure the correct number of words is present in the data split.
    dev_corpus = corpus({"location": dev_dir})
    assert sum(ex.count_words() for ex in dev_corpus.exemplars) == 20
    assert dev_corpus.validate()
def split_corpus(
    in_dir,
    split_dir,
    split_name="split",
    split_words=1000,
    min_split_segs=10,
    leftover_data_split_name="orig",
    rand_seed=None,
):
    """Split an ASR corpus directory by word count, writing splits to split_dir.

    At least 1000 words is recommended for dev or test splits to make WER
    calculations significant ~0.1%. Invalid files, such as empty files, will
    not be included in data splits.

    Args:
        in_dir: directory holding the source corpus (the input is not modified).
        split_dir: directory the resulting splits are written into.
        split_name: name of the extracted split (e.g. "dev").
        split_words: target number of words for the extracted split.
        min_split_segs: minimum number of valid segments required to split.
        leftover_data_split_name: name for the split receiving remaining data.
        rand_seed: set for reproducible splits.

    Exits the process with status 1 if the corpus has fewer valid segments
    than min_split_segs.
    """
    seed(rand_seed)
    c = corpus({"location": in_dir})
    LOGGER.debug("%d exemplars before validating them", len(c.exemplars))
    # Word total is not needed here; we only keep the validated exemplars.
    valid_exemplars, _ = c.count_exemplar_words()
    c.exemplars = valid_exemplars
    LOGGER.debug("%d exemplars after validating them", len(valid_exemplars))
    # Hoisted: compute the segment count once instead of calling
    # calculate_number_of_segments() separately for the check and the log.
    available_segs = c.calculate_number_of_segments()
    if min_split_segs > available_segs:
        LOGGER.error(
            "Not enough valid segments in corpus, %d, to make a split with %d segments. Reduce min_split_segs or get more data",
            available_segs,
            min_split_segs,
        )
        sys.exit(1)
    perform_split(c, split_dir, split_name, split_words, min_split_segs, leftover_data_split_name)
def main():
    """CLI entry point: copy and organize corpora into a target directory.

    Gathers dev/test/train (and unsorted) data from the given corpus
    directories, auto-creating train/test/dev splits when they are not
    already defined, then prepares each split for training under
    --target-dir.
    """
    parser = argparse.ArgumentParser(
        description=
        'Copy and organize specified corpora into a target directory. Training, testing, and development sets will be created automatically if not already defined.'
    )
    parser.add_argument('--target-dir',
                        default='input-data',
                        required=False,
                        help="Path to target directory")
    parser.add_argument(
        'corpora',
        nargs='+',
        help="Name of one or more directories in directory this script is run")
    args = parser.parse_args()
    # NOTE(review): data_dirs is a module-level name not visible in this
    # chunk — presumably ("dev", "test", "train"); confirm in the module.
    for data_dir in data_dirs:
        os.makedirs(args.target_dir + "/" + data_dir, exist_ok=True)
    corpora = {
        "dev": corpus(),
        "test": corpus(),
        "train": corpus(),
        "unsorted": corpus()
    }
    # Accumulate any pre-sorted splits found under each corpus dir, plus
    # whatever sits directly at the corpus root as "unsorted".
    for corpus_dir in args.corpora:
        for data_dir in data_dirs:
            if os.path.exists(corpus_dir + "/" + data_dir):
                corpora[data_dir] += corpus(
                    {"location": corpus_dir + "/" + data_dir})
        corpora['unsorted'] += corpus({"location": corpus_dir})
    # all_ready is evaluated BEFORE unsorted data is merged into train;
    # the order matters for the auto-shuffle branch below.
    all_ready = all(corpora[data_dir].validate() for data_dir in data_dirs)

    # dump extra data into training data by default
    corpora['train'] += corpora['unsorted']

    if not all_ready:
        print(
            "Not all training corpora were prepared. Automatically shuffling into training, testing, development sets"
        )
        # first pass, populate train directory
        if not corpora['train'].validate():
            corpora['train'] += corpora['dev'] + corpora['test']

        # pick a file from training set to be dev set
        # NOTE(review): corpus slicing ([-1], [:-1]) appears to select
        # exemplars — confirm corpus.__getitem__ semantics.
        if not corpora['dev'].validate():
            corpora['dev'] = corpora['train'][-1]
            corpora['train'] = corpora['train'][:-1]

        # pick 20% for testing
        # NOTE(review): validate() seems to double as an exemplar count
        # here (80/20 split index) — confirm its return value.
        if not corpora['test'].validate():
            split_index = int(corpora['train'].validate() * 4 // 5)
            corpora['test'] = corpora['train'][split_index:]
            corpora['train'] = corpora['train'][:split_index]

    # prepare for training
    for data_dir in data_dirs:
        corpora[data_dir].prepare_for_training(args.target_dir + "/" +
                                               data_dir)
def gather_all_corpora(corpora_dirs):
    """Find all existing corpora and gather them into a dictionary.

    Accumulates each split (key in data_dirs) across ALL corpora_dirs, and
    collects each corpus root under the "unsorted" key.

    Bug fix: the previous dict comprehension keyed only on data_dir, so with
    more than one corpus dir each later directory silently OVERWROTE the
    earlier one's corpus for the same split. We now accumulate with += the
    same way main() does.

    Args:
        corpora_dirs: iterable of corpus directory paths.

    Returns:
        dict mapping each data_dir name (plus "unsorted") to a corpus.
    """
    corpora = {data_dir: corpus() for data_dir in data_dirs}
    for corpus_dir in corpora_dirs:
        for data_dir in data_dirs:
            corpora[data_dir] += get_corpus(corpus_dir + "/" + data_dir)

    corpora["unsorted"] = corpus()
    # Iterate lazily; no need to materialize the mapped list first.
    for unsorted_corpus in map(get_corpus, corpora_dirs):
        corpora["unsorted"] += unsorted_corpus
    return corpora
def get_corpus(loc):
    """Build and return a corpus rooted at the given location."""
    config = {"location": loc}
    return corpus(config)