Example #1
import argparse

# Assumed imports: these examples target OpenNMT-tf, where `constants`,
# `tokenizers` and the `Vocab`-providing `utils` module live under the
# `opennmt` package (exact module paths vary between versions).
from opennmt import constants, tokenizers, utils


def main():
    """Generate a word vocabulary from one or more source text files."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "data", nargs="+",
        help="Source text file.")
    parser.add_argument(
        "--save_vocab", required=True,
        help="Output vocabulary file.")
    parser.add_argument(
        "--min_frequency", type=int, default=1,
        help="Minimum word frequency.")
    parser.add_argument(
        "--size", type=int, default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.")
    parser.add_argument(
        "--without_sequence_tokens", default=False, action="store_true",
        help="If set, do not add special sequence tokens (start, end) in the vocabulary.")
    tokenizers.add_command_line_arguments(parser)
    args = parser.parse_args()

    tokenizer = tokenizers.build_tokenizer(args)

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = utils.Vocab(special_tokens=special_tokens)
    for data_file in args.data:
        vocab.add_from_text(data_file, tokenizer=tokenizer)
    vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
    vocab.serialize(args.save_vocab)
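
A hypothetical way to run this example (file names are placeholders; it also assumes the tokenizer options added by tokenizers.add_command_line_arguments can be left at their defaults):

import sys

# Simulate: build_vocab train.en train.de --save_vocab vocab.txt --min_frequency 2 --size 50000
sys.argv = ["build_vocab", "train.en", "train.de",
            "--save_vocab", "vocab.txt",
            "--min_frequency", "2",
            "--size", "50000"]
main()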
Example #2
import argparse

# Same assumed OpenNMT-tf imports as in Example #1.
from opennmt import constants, tokenizers, utils


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data", nargs="*", help="Source text file.")
    parser.add_argument(
        "--from_vocab",
        default=None,
        help="Build from a saved vocabulary (see also --from_format).")
    parser.add_argument(
        "--from_format",
        default="default",
        choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).")
    parser.add_argument("--save_vocab",
                        required=True,
                        help="Output vocabulary file.")
    parser.add_argument("--min_frequency",
                        type=int,
                        default=1,
                        help="Minimum word frequency.")
    parser.add_argument(
        "--size",
        type=int,
        default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.")
    parser.add_argument(
        "--size_multiple",
        type=int,
        default=1,
        help=("Ensure that the vocabulary size + 1 is a multiple of this value "
              "(+ 1 represents the <unk> token that will be added during training)."))
    parser.add_argument(
        "--without_sequence_tokens",
        default=False,
        action="store_true",
        help="If set, do not add special sequence tokens (start, end) in the vocabulary.")
    tokenizers.add_command_line_arguments(parser)
    args = parser.parse_args()

    tokenizer = tokenizers.build_tokenizer(args)

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = utils.Vocab(special_tokens=special_tokens,
                        from_file=args.from_vocab,
                        from_format=args.from_format)
    for data_file in args.data:
        vocab.add_from_text(data_file, tokenizer=tokenizer)
    vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
    vocab.pad_to_multiple(args.size_multiple, num_oov_buckets=1)
    vocab.serialize(args.save_vocab)
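
In this variant, --size_multiple pads the pruned vocabulary so that, once the <unk> token is added at training time, the total size is a multiple of the given value. A small sketch of that arithmetic, independent of the actual vocab.pad_to_multiple implementation:

# Illustrative only: mimic the size computation behind vocab.pad_to_multiple.
def padded_size(current_size, multiple, num_oov_buckets=1):
    # Grow the vocabulary until (size + num_oov_buckets) is a multiple of `multiple`.
    while (current_size + num_oov_buckets) % multiple != 0:
        current_size += 1
    return current_size

print(padded_size(31995, 8))  # 31999 real entries + 1 <unk> = 32000 = 8 * 4000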
Example #3
# Assumed OpenNMT-tf imports, as in the examples above (no tokenizer is used here).
from opennmt import constants, utils


def build_vocab_from_file(src_file, save_path, min_frequency=5, size=0, without_sequence_tokens=False):
    """
    Generate word vocabularies from monolingual corpus.
    :param src_file: Source text file.
    :param save_path: Output vocabulary file.
    :param min_frequency: Minimum word frequency (for the Yelp and Amazon corpora, min_frequency=5).
    :param size: Maximum vocabulary size; if 0, the vocabulary size is not limited.
    :param without_sequence_tokens: If True, do not add the special sequence tokens (start, end) to the vocabulary.
    :return: None.
    """

    special_tokens = [constants.PADDING_TOKEN]
    if not without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = utils.Vocab(special_tokens=special_tokens)
    if isinstance(src_file, list):
        for data_file in src_file:
            vocab.add_from_text(data_file)
    else:
        vocab.add_from_text(src_file)
    vocab = vocab.prune(max_size=size, min_frequency=min_frequency)
    vocab.serialize(save_path)
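
A hypothetical call to this helper, using the Yelp/Amazon setting mentioned in the docstring (file paths are placeholders):

# Build a vocabulary from a single corpus file, keeping words seen at least 5 times.
build_vocab_from_file("train.txt", "vocab.txt", min_frequency=5, size=0)

# src_file may also be a list of corpus files:
build_vocab_from_file(["train.pos.txt", "train.neg.txt"], "vocab.txt", min_frequency=5)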