import argparse

# Assumed imports: Vocab, the tokenizer helpers, and the special-token
# constants are taken from an OpenNMT-tf style package layout; adjust the
# paths if your project organizes these modules differently.
from opennmt import constants, tokenizers, utils


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "data", nargs="+",
        help="Source text file.")
    parser.add_argument(
        "--save_vocab", required=True,
        help="Output vocabulary file.")
    parser.add_argument(
        "--min_frequency", type=int, default=1,
        help="Minimum word frequency.")
    parser.add_argument(
        "--size", type=int, default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.")
    parser.add_argument(
        "--without_sequence_tokens", default=False, action="store_true",
        help="If set, do not add special sequence tokens (start, end) "
             "in the vocabulary.")
    tokenizers.add_command_line_arguments(parser)
    args = parser.parse_args()

    tokenizer = tokenizers.build_tokenizer(args)

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = utils.Vocab(special_tokens=special_tokens)
    for data_file in args.data:
        vocab.add_from_text(data_file, tokenizer=tokenizer)
    vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
    vocab.serialize(args.save_vocab)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "data", nargs="*",
        help="Source text file.")
    parser.add_argument(
        "--from_vocab", default=None,
        help="Build from a saved vocabulary (see also --from_format).")
    parser.add_argument(
        "--from_format", default="default", choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).")
    parser.add_argument(
        "--save_vocab", required=True,
        help="Output vocabulary file.")
    parser.add_argument(
        "--min_frequency", type=int, default=1,
        help="Minimum word frequency.")
    parser.add_argument(
        "--size", type=int, default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.")
    parser.add_argument(
        "--size_multiple", type=int, default=1,
        help=("Ensure that the vocabulary size + 1 is a multiple of this value "
              "(+ 1 represents the <unk> token that will be added during the training)."))
    parser.add_argument(
        "--without_sequence_tokens", default=False, action="store_true",
        help="If set, do not add special sequence tokens (start, end) "
             "in the vocabulary.")
    tokenizers.add_command_line_arguments(parser)
    args = parser.parse_args()

    tokenizer = tokenizers.build_tokenizer(args)

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = utils.Vocab(
        special_tokens=special_tokens,
        from_file=args.from_vocab,
        from_format=args.from_format)
    for data_file in args.data:
        vocab.add_from_text(data_file, tokenizer=tokenizer)
    vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
    vocab.pad_to_multiple(args.size_multiple, num_oov_buckets=1)
    vocab.serialize(args.save_vocab)
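# A hypothetical invocation of the command-line entry point above, assuming the
# module is saved as build_vocab.py (the script name and data file names are
# illustrative, not part of the original code):
#
#   python build_vocab.py --save_vocab vocab.txt --size 50000 \
#       --min_frequency 2 --size_multiple 8 train.tok.txt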
def build_vocab_from_file(src_file, save_path, min_frequency=5, size=0,
                          without_sequence_tokens=False):
    """Generate word vocabularies from a monolingual corpus.

    :param src_file: Source text file, or a list of text files.
    :param save_path: Output vocabulary file.
    :param min_frequency: Minimum word frequency (for yelp and amazon, min_frequency=5).
    :param size: Maximum vocabulary size. If = 0, do not limit vocabulary.
    :param without_sequence_tokens: If set, do not add special sequence tokens
        (start, end) in the vocabulary.
    :return: None.
    """
    special_tokens = [constants.PADDING_TOKEN]
    if not without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = utils.Vocab(special_tokens=special_tokens)
    if isinstance(src_file, list):
        for data_file in src_file:
            vocab.add_from_text(data_file)
    else:
        vocab.add_from_text(src_file)
    vocab = vocab.prune(max_size=size, min_frequency=min_frequency)
    vocab.serialize(save_path)
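if __name__ == "__main__":
    # Usage sketch for build_vocab_from_file (the file names are illustrative):
    # build one shared vocabulary from two monolingual corpora, keeping words
    # seen at least 5 times, as in the yelp/amazon setting noted above.
    build_vocab_from_file(
        ["train.pos.txt", "train.neg.txt"],
        "vocab.txt",
        min_frequency=5,
    )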