def test_create_partition_forced(self):
    """
    Check that forced files end up among the validation files.

    Partitions the sample corpus files into 4 development bins and
    1 validation bin while forcing a single file, writes the partition,
    and then verifies the number of bins, that no file was lost, that
    development and validation files do not overlap, and that every
    forced file is part of the validation files.
    """
    st = create_setting()
    # cannot reuse create_sample_partition because we need pgc_files
    # for asserting
    pgc_files = relglob(st.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
    # an empty glob result would make the checks below pass vacuously
    self.assertTrue(pgc_files)

    forced_fns = ['news/pgc/ma/2006-11/news-2006-11-aligned-part-02.pgc']

    corpus_fns, corpus_sizes, dev_parts, val_parts, dev_sizes, val_sizes = \
        create_partition(pgc_files,
                         corpus_dir=st.corpus_dir,
                         dev_bins=4,
                         val_bins=1,
                         forced_fns=forced_fns)
    write_partition(corpus_fns, corpus_sizes, dev_parts, val_parts,
                    dev_sizes, val_sizes)

    self.assertEqual(len(dev_parts), 4)
    # one more validation bin than requested is expected
    # (presumably the extra bin holding the forced files)
    self.assertEqual(len(val_parts), 1 + 1)

    # flatten the lists of bins into sets of file names
    dev_fns = {part_fname
               for part_list in dev_parts
               for part_fname in part_list}
    val_fns = {part_fname
               for part_list in val_parts
               for part_fname in part_list}

    # check if no files were lost
    self.assertEqual(len(dev_fns) + len(val_fns), len(pgc_files))

    # check for overlap
    self.assertTrue(dev_fns.isdisjoint(val_fns))

    # check for forced files; assertIn reports the missing file name
    # on failure instead of a bare "False is not true"
    for forced_fname in forced_fns:
        self.assertIn(forced_fname, val_fns)
def create_sample_partition(setting):
    """
    Build a partition of the sample corpus and write it to "partition.py".

    The sample files are looked up with the pattern
    "news/pgc/ma/2006-11/*.pgc" using setting.corpus_dir, partitioned
    into 4 development bins and 1 validation bin, and the resulting
    partition is handed to write_partition.

    As noted by the original author, this assumes a sample corpus under
    the "corpus" subdir.
    """
    sample_fns = relglob(setting.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
    bin_counts = dict(dev_bins=4, val_bins=1)
    parts = create_partition(sample_fns,
                             corpus_dir=setting.corpus_dir,
                             **bin_counts)
    write_partition(*parts, out="partition.py")
help="number of bins used for development data (default is 10)") parser.add_argument( "-f", "--force", nargs="*", default=[], metavar="FILE", help="force parallel graph corpus file into extra validation bin") parser.add_argument( "-o", "--val-bins", type=int, default=2, help="number of bins used for validation data (default is 2)") args = parser.parse_args() pgc_fns = expand_globs(args.corpus_dir, args.pgc_glob) partition = create_partition(pgc_fns, corpus_dir=args.corpus_dir, dev_bins=args.dev_bins, val_bins=args.val_bins, forced_fns=args.force) write_partition(*partition)
type=int, default=10, help="number of bins used for development data (default is 10)") parser.add_argument( "-f", "--force", nargs="*", default=[], metavar="FILE", help="force parallel graph corpus file into extra validation bin") parser.add_argument( "-o", "--val-bins", type=int, default=2, help="number of bins used for validation data (default is 2)") args = parser.parse_args() pgc_fns = expand_globs(args.corpus_dir, args.pgc_glob) partition = create_partition(pgc_fns, corpus_dir=args.corpus_dir, dev_bins=args.dev_bins, val_bins=args.val_bins, forced_fns=args.force) write_partition(*partition)