Ejemplo n.º 1
0
 def test_create_partition_forced(self):
     # Partition the sample corpus while forcing one file via forced_fns,
     # then check the number of bins, that no file is lost, that the
     # development and validation sets do not overlap, and that the forced
     # file ends up among the validation files.
     setting = create_setting()
     # create_sample_partition is not reused here, because the list of
     # pgc files itself is needed for the assertions below
     all_pgc_fns = relglob(setting.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
     self.assertTrue(all_pgc_fns)

     must_validate = ['news/pgc/ma/2006-11/news-2006-11-aligned-part-02.pgc']

     result = create_partition(all_pgc_fns,
                               corpus_dir=setting.corpus_dir,
                               dev_bins=4,
                               val_bins=1,
                               forced_fns=must_validate)
     (corpus_fns, corpus_sizes,
      dev_parts, val_parts,
      dev_sizes, val_sizes) = result

     write_partition(corpus_fns, corpus_sizes, dev_parts, val_parts,
                     dev_sizes, val_sizes)

     self.assertEqual(len(dev_parts), 4)
     # one requested validation bin plus one more; presumably the extra
     # bin is the one holding the forced files -- confirm in
     # create_partition
     self.assertEqual(len(val_parts), 1 + 1)

     # flatten the per-bin filename lists into sets of unique filenames
     dev_fns = set()
     for bin_fnames in dev_parts:
         dev_fns.update(bin_fnames)

     val_fns = set()
     for bin_fnames in val_parts:
         val_fns.update(bin_fnames)

     # no file may get lost: together both sets account for every input
     self.assertEqual(len(dev_fns) + len(val_fns),
                      len(all_pgc_fns))

     # no file may occur in both development and validation data
     self.assertTrue(dev_fns.isdisjoint(val_fns))

     # every forced file must be part of the validation data
     for fname in must_validate:
         self.assertTrue(fname in val_fns)
Ejemplo n.º 2
0
def create_sample_partition(setting):
    """
    Partition the sample corpus and write the result to "partition.py".

    The corpus files are looked up by calling relglob with
    setting.corpus_dir and the pattern "news/pgc/ma/2006-11/*.pgc".
    They are partitioned with 4 development bins and 1 validation bin,
    and the outcome is handed to write_partition with out="partition.py".
    """
    sample_fnames = relglob(setting.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
    options = dict(corpus_dir=setting.corpus_dir, dev_bins=4, val_bins=1)
    result = create_partition(sample_fnames, **options)
    write_partition(*result, out="partition.py")
Ejemplo n.º 3
0
    type=int,
    default=10,
    help="number of bins used for development data (default is 10)")

# -f/--force: zero or more file names (nargs="*"); an empty list when the
# option is not given. The value is passed on as forced_fns below.
parser.add_argument(
    "-f",
    "--force",
    nargs="*",
    default=[],
    metavar="FILE",
    help="force parallel graph corpus file into extra validation bin")

# --val-bins: integer number of validation bins, 2 unless overridden.
# NOTE(review): the short flag is "-o", which is unusual for this option
# name -- confirm it is intended before changing it, since existing
# command lines may rely on it.
parser.add_argument(
    "-o",
    "--val-bins",
    type=int,
    default=2,
    help="number of bins used for validation data (default is 2)")

# Parse the command line; the corpus_dir, pgc_glob and dev_bins attributes
# used below come from arguments registered earlier in the script.
args = parser.parse_args()

# Resolve the glob pattern(s) against the corpus directory
# (presumably yields the list of corpus file names -- see expand_globs).
pgc_fns = expand_globs(args.corpus_dir, args.pgc_glob)

# Build the partition from the command line settings.
partition = create_partition(pgc_fns,
                             corpus_dir=args.corpus_dir,
                             dev_bins=args.dev_bins,
                             val_bins=args.val_bins,
                             forced_fns=args.force)

# Unpack the partition into positional arguments; no "out" is given, so
# write_partition falls back to whatever its default is.
write_partition(*partition)