Exemple #1
0
def main(argv=None):
    """Run the ``thainlp`` command line tool.

    Only the first argument after the program name is parsed here; it
    selects an attribute of ``cli`` whose ``App`` is then called with the
    full argument list.

    :param argv: argument list laid out like ``sys.argv`` (program name
        first); ``sys.argv`` is used when this is ``None`` or empty.
    """
    argv = argv or sys.argv

    usage_text = (
        "thainlp <command> [options]\n\n"
        "Example:\n\n"
        "thainlp data catalog\n\n"
        "--"
    )
    parser = argparse.ArgumentParser(
        prog="thainlp",
        description="Thai natural language processing.",
        usage=usage_text,
    )
    parser.add_argument(
        "command",
        type=str,
        choices=cli.COMMANDS,
        help="text processing action",
    )

    # Only the command token is parsed; the selected App receives the
    # complete argv and handles the remaining tokens itself.
    parsed = parser.parse_args(argv[1:2])
    cli.exit_if_empty(parsed.command, parser)

    if not hasattr(cli, parsed.command):
        return

    handler = getattr(cli, parsed.command)
    handler.App(argv)
Exemple #2
0
def main(args=None):
    """The main routine of PyThaiNLP command line.

    :param args: command line arguments *without* the program name;
        defaults to ``sys.argv[1:]`` when ``None``.
    """
    if args is None:
        args = sys.argv[1:]
    # Accept any sequence (tuple, list, ...) and avoid mutating the caller's.
    args = list(args)

    parser = argparse.ArgumentParser(
        usage="pythainlp command [subcommand] [options]")

    parser.add_argument("command",
                        type=str,
                        default="",
                        nargs="?",
                        help="[%s]" % "|".join(cli.COMMANDS))

    # Honour the ``args`` parameter: previously it was ignored and
    # ``sys.argv`` was always parsed, so main() could not be driven
    # programmatically.
    parsed = parser.parse_args(args[:1])

    cli.exit_if_empty(parsed.command, parser)

    if hasattr(cli, parsed.command):
        command = getattr(cli, parsed.command)
        # The App used to receive ``sys.argv`` (program name at index 0),
        # so rebuild that layout from the effective argument list.
        command.App(sys.argv[:1] + args)
    else:
        print(
            f"Command not available: {parsed.command}\nPlease run with --help for alternatives"
        )
Exemple #3
0
    def __init__(self, argv):
        """Dispatch ``tokenize`` to the app for the requested subcommand.

        :param argv: full argument list; index 2 is parsed as the
            subcommand and everything from index 3 on is handed to the
            selected app.
        :raises NotImplementedError: if the subcommand matches none of
            the recognised prefixes.
        """
        parser = argparse.ArgumentParser(**cli.make_usage("tokenize"))
        parser.add_argument(
            "subcommand",
            type=str,
            nargs="?",
            help="[subword|syllable|word|sent]",
        )

        parsed = parser.parse_args(argv[2:3])
        cli.exit_if_empty(parsed.subcommand, parser)

        subcommand = str.lower(parsed.subcommand)
        app_argv = argv[3:]

        # Subcommands are matched by prefix, so abbreviations are accepted.
        if subcommand.startswith("word"):
            WordTokenizationApp("word", app_argv)
            return
        if subcommand.startswith("syl"):
            SyllableTokenizationApp("syllable", app_argv)
            return
        if subcommand.startswith("subw"):
            SubwordTokenizationApp("subword", app_argv)
            return
        if subcommand.startswith("sent"):
            # NOTE(review): "sent" is routed to SubwordTokenizationApp;
            # possibly a sentence tokenization app was intended - confirm.
            SubwordTokenizationApp("sent", app_argv)
            return

        raise NotImplementedError(f"Subcommand not available: {subcommand}")
Exemple #4
0
    def __init__(self, argv):
        """Parse ``corpus`` arguments and invoke the matching ``App`` attribute.

        :param argv: full argument list; everything from index 2 on is
            parsed (subcommand plus ``--name``).
        :raises NotImplementedError: if ``App`` has no attribute named
            after the lower-cased subcommand.
        """
        parser = argparse.ArgumentParser(**cli.make_usage("corpus"))
        parser.add_argument(
            "subcommand",
            type=str,
            default="",
            nargs="?",
            help="[download|remove]",  # there should be a "list" subcommand
        )
        parser.add_argument("--name", type=str, help="corpus's name")

        parsed = parser.parse_args(argv[2:])
        cli.exit_if_empty(parsed.subcommand, parser)

        subcommand = str.lower(parsed.subcommand)
        if not hasattr(App, subcommand):
            raise NotImplementedError(
                f"Subcommand not available: {subcommand}")

        # The handler is looked up on the class and receives the parsed
        # namespace as its only argument.
        handler = getattr(App, subcommand)
        handler(parsed)
Exemple #5
0
    def __init__(self, name, argv):
        """Parse tokenization options, run the tokenizer and print the result.

        Defaults are read from ``self.separator``, ``self.engine`` and
        ``self.keep_whitespace``. The parsed namespace is stored on
        ``self.args``.

        :param name: tokenization unit name, used only in the usage text.
        :param argv: option arguments for this app (already stripped of
            the command and subcommand tokens by the caller).
        """

        def _parse_bool(value):
            """Convert a command line string to a boolean.

            ``type=bool`` is a pitfall: ``bool("False")`` is ``True``.
            Explicit false-like words map to ``False``; any other
            non-empty value stays truthy, as it was before.
            """
            return value.strip().lower() not in (
                "", "0", "false", "f", "no", "n", "off")

        parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))
        parser.add_argument(
            "-t",
            "--text",
            type=str,
            help="input text",
        )

        # The separator is text, not a flag: with ``type=bool`` even the
        # string default was converted to ``True`` by argparse.
        parser.add_argument("-s",
                            "--sep",
                            type=str,
                            help=f"default: {self.separator}",
                            default=self.separator)

        parser.add_argument("-e",
                            "--engine",
                            type=str,
                            help=f"default: {self.engine}",
                            default=self.engine)

        parser.add_argument("-w",
                            "--keep-whitespace",
                            type=_parse_bool,
                            help=f"default: {self.keep_whitespace}",
                            default=self.keep_whitespace)

        args = parser.parse_args(argv)

        self.args = args

        cli.exit_if_empty(args.text, parser)

        result = self.run(args.text, engine=args.engine)
        # Join with the separator the user asked for (defaults to
        # ``self.separator``), instead of always ignoring ``--sep``.
        print(args.sep.join(result))
Exemple #6
0
    def __init__(self, name, argv):
        """Tokenize ``--text`` with the chosen engine and print the tokens.

        The parsed namespace is stored on ``self.args``; output tokens are
        joined with ``self.separator``.

        :param name: tokenization unit name, used only in the usage text.
        :param argv: option arguments for this app.
        """
        usage = cli.make_usage("tokenization " + name)
        parser = argparse.ArgumentParser(**usage)

        parser.add_argument("--text", type=str, help="input text")
        parser.add_argument(
            "--engine",
            type=str,
            default=self.default_engine,
            help="default: %s" % self.default_engine,
        )

        parsed = parser.parse_args(argv)
        self.args = parsed

        cli.exit_if_empty(parsed.text, parser)

        print(f"Using engine={parsed.engine}")
        tokens = self.run(parsed.text, engine=parsed.engine)
        print(self.separator.join(tokens))
Exemple #7
0
    def __init__(self, name, argv):
        """Tokenize the positional text and print the separated tokens.

        Defaults for the separator and algorithm come from
        ``self.separator`` and ``self.algorithm``. The parsed namespace is
        stored on ``self.args``. The separator is also appended after the
        last token.

        :param name: tokenization unit name, used only in the usage text.
        :param argv: option arguments for this app.
        """
        parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))

        parser.add_argument("text", type=str, nargs="?", help="input text")
        parser.add_argument(
            "-s", "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        parser.add_argument(
            "-a", "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )

        # Two opposite flags share one destination; whitespace is kept
        # unless -nw/--no-whitespace is given.
        parser.add_argument(
            "-w", "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        parser.add_argument(
            "-nw", "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        parser.set_defaults(keep_whitespace=True)

        parsed = parser.parse_args(argv)
        self.args = parsed

        cli.exit_if_empty(parsed.text, parser)

        tokens = self.run(
            parsed.text,
            engine=parsed.algorithm,
            keep_whitespace=parsed.keep_whitespace,
        )
        joined = parsed.separator.join(tokens)
        print(joined + parsed.separator)
Exemple #8
0
    def __init__(self, argv):
        """Dispatch ``tag`` to the tagging app for the given subcommand.

        :param argv: full argument list; index 2 is parsed as the
            subcommand and everything from index 3 on is passed along.
        :raises NotImplementedError: if the subcommand is not ``pos``.
        """
        parser = argparse.ArgumentParser(**cli.make_usage("tag"))
        parser.add_argument("subcommand", type=str, nargs="?", help="[pos]")

        parsed = parser.parse_args(argv[2:3])
        cli.exit_if_empty(parsed.subcommand, parser)

        subcommand = str.lower(parsed.subcommand)
        if subcommand != "pos":
            raise NotImplementedError(
                f"Subcommand not available: {subcommand}")

        POSTaggingApp("Part-of-Speech tagging", argv[3:])
Exemple #9
0
    def __init__(self, argv):
        """Dispatch ``tokenize`` to the app matching the token type.

        The token type is matched by prefix after lower-casing. An
        unrecognised type only prints a message.

        :param argv: full argument list; index 2 is parsed as the token
            type and everything from index 3 on is handed to the app.
        """
        usage_text = (
            'thainlp tokenize <token_type> [options] "<text>"\n\n'
            "token_type:\n\n"
            "subword            subword (may not be a linguistic unit)\n"
            "syllable           syllable\n"
            "word               word\n"
            "sent               sentence\n\n"
            "options:\n\n"
            "--sep or -s <separator>    specify custom separator\n"
            "                           (default is a space)\n"
            "--algo or -a <algorithm>   tokenization algorithm\n"
            "                           (see API doc for more info)\n"
            "--keep-whitespace or -w    keep whitespaces in output\n"
            "                           (default)\n\n"
            "<separator> and <text> should be inside double quotes.\n\n"
            "Example:\n\n"
            'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
            "--"
        )
        parser = argparse.ArgumentParser(
            prog="tokenize",
            description="Break a text into small units (tokens).",
            usage=usage_text,
        )
        parser.add_argument(
            "token_type",
            type=str,
            help="[subword|syllable|word|sent]",
        )

        parsed = parser.parse_args(argv[2:3])
        cli.exit_if_empty(parsed.token_type, parser)

        token_type = str.lower(parsed.token_type)
        app_argv = argv[3:]

        if token_type.startswith("w"):
            WordTokenizationApp("word", app_argv)
            return
        if token_type.startswith("sy"):
            SyllableTokenizationApp("syllable", app_argv)
            return
        if token_type.startswith("su"):
            SubwordTokenizationApp("subword", app_argv)
            return
        if token_type.startswith("se"):
            SentenceTokenizationApp("sent", app_argv)
            return

        print(f"Token type not available: {token_type}")
Exemple #10
0
    def __init__(self, argv):
        """Print the soundex code of ``--text`` using ``--engine``.

        :param argv: full argument list; everything from index 2 on is
            parsed.
        """
        parser = argparse.ArgumentParser(prog="soundex")
        parser.add_argument("--text", type=str, help="text")
        parser.add_argument(
            "--engine",
            type=str,
            default="udom83",
            help="[udom83|lk82|metasound] (default: udom83)",
        )

        parsed = parser.parse_args(argv[2:])
        cli.exit_if_empty(parsed.text, parser)

        print(soundex(parsed.text, engine=parsed.engine))
Exemple #11
0
    def __init__(self, argv):
        """Dispatch ``benchmark`` to the benchmark for the requested task.

        Only ``word-tokenization`` is handled; any other task name is
        ignored without output.

        :param argv: full argument list; index 2 is parsed as the task
            and everything from index 3 on is passed to the benchmark.
        """
        usage_text = (
            "thainlp benchmark [task] [task-options]\n\n"
            "tasks:\n\n"
            "word-tokenization      benchmark word tokenization\n\n"
            "--"
        )
        description_text = (
            "Benchmark for various tasks;\n"
            "currently, we have only for word tokenization."
        )
        parser = argparse.ArgumentParser(
            prog="benchmark",
            description=description_text,
            usage=usage_text,
        )
        parser.add_argument("task", type=str, help="[word-tokenization]")

        parsed = parser.parse_args(argv[2:3])
        cli.exit_if_empty(parsed.task, parser)

        task = str.lower(parsed.task)
        if task != "word-tokenization":
            return

        WordTokenizationBenchmark(task, argv[3:])
Exemple #12
0
    def __init__(self, argv):
        """Dispatch ``tagging`` to the tagging app for the given command.

        :param argv: full argument list; index 2 is parsed as the command
            and everything from index 3 on is passed to the selected app.
        :raises ValueError: if the command is not ``pos``.
        """
        parser = argparse.ArgumentParser(**cli.make_usage("tagging"))
        parser.add_argument(
            "command",
            type=str,
            nargs="?",
            help="[pos]"
        )

        args = parser.parse_args(argv[2:3])
        command = args.command

        cli.exit_if_empty(args.command, parser)

        argv = argv[3:]

        if command == "pos":
            POSTaggingApp("Part-of-Speech tagging", argv)
        else:
            # The message previously referenced an undefined name
            # (``subcommand``), which raised NameError instead of the
            # intended ValueError.
            raise ValueError(f"no command:{command}")
Exemple #13
0
    def __init__(self, argv):
        """Dispatch ``tokenization`` to the word or syllable app.

        Commands other than ``word`` and ``syllable`` are ignored without
        output.

        :param argv: full argument list; index 2 is parsed as the command
            and everything from index 3 on is passed to the selected app.
        """
        parser = argparse.ArgumentParser(**cli.make_usage("tokenization"))
        parser.add_argument(
            "command",
            type=str,
            nargs="?",
            help="[word|syllable]",
        )

        parsed = parser.parse_args(argv[2:3])
        command = parsed.command
        cli.exit_if_empty(command, parser)

        app_argv = argv[3:]

        if command == "word":
            WordTokenizationApp("word", app_argv)
            return
        if command == "syllable":
            SyllableTokenizationApp("syllable", app_argv)
Exemple #14
0
    def __init__(self, argv):
        """Print the soundex code of a text using the requested engine.

        The engine is chosen by prefix-matching the lower-cased subcommand
        (``udom`` -> udom83, ``lk`` -> lk82, ``meta`` -> metasound).

        :param argv: full argument list; everything from index 2 on is
            parsed (engine subcommand plus ``-t/--text``).
        :raises NotImplementedError: if the subcommand matches no known
            engine prefix.
        """
        parser = argparse.ArgumentParser("soundex")

        parser.add_argument(
            "subcommand",
            type=str,
            nargs="?",
            help="[udom83|lk82|metasound]"
        )

        parser.add_argument(
            "-t",
            "--text",
            type=str,
            help="input text",
        )

        # Parse the whole tail rather than ``argv[2:3]``: a one-element
        # slice can never hold both the subcommand and the -t/--text
        # value, so one of the emptiness checks below always failed.
        args = parser.parse_args(argv[2:])

        cli.exit_if_empty(args.subcommand, parser)
        subcommand = str.lower(args.subcommand)

        cli.exit_if_empty(args.text, parser)

        # (subcommand prefix, engine name) pairs, checked in order.
        engines = (
            ("udom", "udom83"),
            ("lk", "lk82"),
            ("meta", "metasound"),
        )
        for prefix, engine in engines:
            if subcommand.startswith(prefix):
                print(soundex(args.text, engine=engine))
                return

        raise NotImplementedError(
            f"Subcommand not available: {subcommand}")
Exemple #15
0
    def __init__(self, argv):
        """Parse ``corpus`` arguments and invoke the matching ``App`` attribute.

        An unknown command only prints a message.

        :param argv: full argument list; everything from index 2 on is
            parsed (``--name`` plus the command).
        """
        parser = argparse.ArgumentParser(**cli.make_usage("corpus"))
        parser.add_argument("--name", type=str, help="corpus's name")
        parser.add_argument(
            "command",
            type=str,
            default="",
            nargs="?",
            help="[download|remove]",
        )

        parsed = parser.parse_args(argv[2:])
        cli.exit_if_empty(parsed.command, parser)

        command = parsed.command
        if not hasattr(App, command):
            print("No command available: %s" % command)
            return

        # The handler is looked up on the class and receives the parsed
        # namespace as its only argument.
        handler = getattr(App, command)
        handler(parsed)