def test_invalid_partition(self):
    """from_text must raise ValueError when asked to build a vocabulary from a partition name that does not exist."""
    text_partitions = {
        "train": ["red red red", "blue blue green"],
        "validate": ["red blue blue orange"],
        "test": ["green green red black"]
    }
    vocabulary_factory = Vocabulary.factory(WordTokenizer(True))
    with self.assertRaises(ValueError):
        PartitionedData.from_text(text_partitions, ["bogus"], vocabulary_factory)
def create_partitioned_data():
    """Build a small PartitionedData fixture whose vocabulary is drawn from the "train" partition only."""
    texts = {
        "train": ["blue blue green", "red red red"],
        "test": ["green green red black"],
        "validate": ["red blue blue orange"]
    }
    vocabulary_factory = Vocabulary.factory(WordTokenizer(True))
    return PartitionedData.from_text(texts, ["train"], vocabulary_factory)
def create_data_set(args):
    """Create a PartitionedData set from text files and serialize it to disk.

    Reads the files named in args.partitions, tokenizes them with the
    tokenizer selected by args.tokenizer, builds vocabularies from
    args.vocabulary_partitions (defaulting to every partition), and writes
    the result to args.directory.

    :param args: parsed command-line arguments; must provide partitions,
        vocabulary_partitions, tokenizer, case_normalized, min_frequency,
        max_vocabulary, out_of_vocabulary, and directory
    """
    if args.vocabulary_partitions is None:
        # Default: build the vocabulary from every partition.
        args.vocabulary_partitions = args.partitions.keys()
    tokenizer = {
        "whitespace": WhitespaceTokenizer,
        "word": WordTokenizer,
        "character": CharacterTokenizer
    }[args.tokenizer](args.case_normalized)
    factory = Vocabulary.factory(tokenizer,
                                 min_frequency=args.min_frequency,
                                 max_vocabulary=args.max_vocabulary,
                                 out_of_vocabulary=args.out_of_vocabulary)
    partitions = {}
    for partition, filenames in args.partitions.items():
        texts = []
        for filename in filenames:
            # Context manager closes each handle; the original
            # open(filename).read() leaked the file descriptor.
            with open(filename) as text_file:
                texts.append(text_file.read())
        partitions[partition] = texts
    partitioned_data = PartitionedData.from_text(partitions, args.vocabulary_partitions, factory)
    partitioned_data.serialize(args.directory)
    # Lazy %-style arguments defer formatting until the record is emitted.
    logger.info("Created %s in %s", partitioned_data, args.directory)
def create_data_set(args):
    """Create a PartitionedData set from text files and serialize it to disk.

    Reads the files named in args.partitions, tokenizes them with the
    tokenizer selected by args.tokenizer, builds vocabularies from
    args.vocabulary_partitions (defaulting to every partition), and writes
    the result to args.directory.

    :param args: parsed command-line arguments; must provide partitions,
        vocabulary_partitions, tokenizer, case_normalized, min_frequency,
        max_vocabulary, out_of_vocabulary, and directory
    """
    if args.vocabulary_partitions is None:
        # Default: build the vocabulary from every partition.
        args.vocabulary_partitions = args.partitions.keys()
    tokenizer = {
        "whitespace": WhitespaceTokenizer,
        "word": WordTokenizer,
        "character": CharacterTokenizer
    }[args.tokenizer](args.case_normalized)
    factory = Vocabulary.factory(tokenizer,
                                 min_frequency=args.min_frequency,
                                 max_vocabulary=args.max_vocabulary,
                                 out_of_vocabulary=args.out_of_vocabulary)
    partitions = {}
    for partition, filenames in args.partitions.items():
        texts = []
        for filename in filenames:
            # Context manager closes each handle; the original
            # open(filename).read() leaked the file descriptor.
            with open(filename) as text_file:
                texts.append(text_file.read())
        partitions[partition] = texts
    partitioned_data = PartitionedData.from_text(partitions, args.vocabulary_partitions, factory)
    partitioned_data.serialize(args.directory)
    # Lazy %-style arguments defer formatting until the record is emitted.
    logger.info("Created %s in %s", partitioned_data, args.directory)
def create_partitioned_data():
    """Return a tiny three-partition data fixture; only "train" feeds the vocabulary."""
    partition_text = {
        "train": ["blue blue green", "red red red"],
        "test": ["green green red black"],
        "validate": ["red blue blue orange"]
    }
    return PartitionedData.from_text(
        partition_text,
        ["train"],
        Vocabulary.factory(WordTokenizer(True))
    )
def test_vocabulary_factory(self):
    """A factory-built vocabulary must equal one constructed directly with the same settings."""
    text = ["to be or not to be"]
    factory = Vocabulary.factory(WordTokenizer(True), max_vocabulary=2)
    expected = Vocabulary(text, WordTokenizer(True), max_vocabulary=2)
    self.assertEqual(expected, factory(text))
def test_vocabulary_factory(self):
    """Factory output matches direct Vocabulary construction for identical parameters."""
    sample = ["to be or not to be"]
    make_vocabulary = Vocabulary.factory(WordTokenizer(True), max_vocabulary=2)
    self.assertEqual(
        Vocabulary(sample, WordTokenizer(True), max_vocabulary=2),
        make_vocabulary(sample)
    )
def test_invalid_partition(self):
    """Requesting vocabulary from an unknown partition name must raise ValueError."""
    data = {
        "train": ["red red red", "blue blue green"],
        "validate": ["red blue blue orange"],
        "test": ["green green red black"]
    }
    self.assertRaises(
        ValueError,
        PartitionedData.from_text,
        data,
        ["bogus"],
        Vocabulary.factory(WordTokenizer(True))
    )