Example #1
    def __init__(self,
                 do_lower_case=False,
                 never_split=None,
                 normalize_text=True,
                 mecab_option: Optional[str] = None):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
            **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        import fugashi
        import ipadic

        # Use ipadic by default (later options can override it)
        mecab_option = mecab_option or ""
        mecab_option = ipadic.MECAB_ARGS + " " + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)
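
A minimal sketch of the fugashi/ipadic pairing this constructor builds on, exercising the tagger directly (the surrounding MecabTokenizer class is assumed and not shown here):

import fugashi
import ipadic

# ipadic.MECAB_ARGS points MeCab at the bundled IPADIC dictionary
tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS)
for node in tagger("日本語の文章です"):
    print(node.surface, node.feature)  # surface form and raw feature tuple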
Example #2
    def __init__(self, mecab_option=None):
        import os
        import fugashi
        if mecab_option is None:
            import unidic_lite
            dic_dir = unidic_lite.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {}".format(dic_dir, mecabrc)

        self.mecab = fugashi.GenericTagger(mecab_option)
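
A quick way to confirm which dictionary such a tagger ended up loading; `dictionary_info` is fugashi's view of MeCab's dictionary metadata (the same field Example #4 below inspects for its charset check):

import os
import fugashi
import unidic_lite

dic_dir = unidic_lite.DICDIR
mecabrc = os.path.join(dic_dir, "mecabrc")
tagger = fugashi.GenericTagger("-d {} -r {}".format(dic_dir, mecabrc))
print(tagger.dictionary_info[0]["charset"])  # e.g. 'utf8'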
Example #3

import fugashi
import ipadic


def tokenize(lines):
    # -Owakati asks MeCab for space-separated surface output
    tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS + ' -Owakati')
    for i, line in enumerate(lines):
        lines[i] = tagger(line)  # tagger(line) is a list of fugashi nodes
    return lines
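
Called on a small batch, the function mutates and returns the same list of node lists; surfaces can be pulled out afterwards (a sketch):

lines = tokenize(["吾輩は猫である。", "名前はまだ無い。"])
surfaces = [[node.surface for node in nodes] for nodes in lines]
print(surfaces)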
Example #4
def main():
    args = parse_args()

    if args.unidic:
        dicdir = args.dicdir or unidic_lite.DICDIR
        rcfile = os.path.join(dicdir, 'mecabrc')
        tokenizer = unidic_tokenize
    else:
        dicdir = args.dicdir or '/var/lib/mecab/dic/juman-utf8'
        rcfile = args.rcfile or '/etc/mecabrc'
        tokenizer = juman_tokenize
    assert dicdir and rcfile

    global tagger
    tagger = fugashi.GenericTagger(f'-r {rcfile} -d {dicdir}')
    charset = tagger.dictionary_info[0]['charset']
    assert charset == 'utf-8' or charset == 'utf8'

    dataset = []
    with gzip.open(args.rcqafile, "rt", encoding="utf-8") as fp:
        for line in fp:
            data = json.loads(line)
            if data["documents"]:
                dataset.append(data)

    train_dataset = [data for data in dataset if data["timestamp"] < "2009"]
    dev_dataset = [
        data for data in dataset if "2009" <= data["timestamp"] < "2010"
    ]
    test_dataset = [data for data in dataset if "2010" <= data["timestamp"]]

    for filename, datasplit in (("rcqa_train.json", train_dataset),
                                ("rcqa_dev.json", dev_dataset),
                                ("rcqa_test.json", test_dataset)):
        entries = convert(datasplit, tokenizer, args.oldformat)
        with open(filename, "w", encoding="utf-8") as fp:
            json.dump({"data": entries}, fp, ensure_ascii=False)
Example #5
    def __init__(self,
                 mecab_dic: Optional[str] = None,
                 mecab_option: Optional[str] = None) -> None:
        import fugashi
        mecab_option = mecab_option or ""

        if mecab_dic is not None:
            if mecab_dic == "unidic_lite":
                import unidic_lite
                dic_dir = unidic_lite.DICDIR
            elif mecab_dic == "unidic":
                import unidic
                dic_dir = unidic.DICDIR
            elif mecab_dic == "ipadic":
                import ipadic
                dic_dir = ipadic.DICDIR
            else:
                raise ValueError("Invalid mecab_dic is specified.")

            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(dic_dir,
                                                 mecabrc) + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)
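
Assuming this __init__ sits on a class named, say, MecabTokenizer (the class name and body are not shown here), dictionary selection reduces to a single argument:

tokenizer = MecabTokenizer(mecab_dic="unidic_lite")  # bundled UniDic Lite
tokenizer = MecabTokenizer(mecab_dic=None,           # or a system dictionary
                           mecab_option="-d /usr/lib/mecab/dic/ipadic")  # hypothetical path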
Example #6

    def __init__(
        self,
        do_lower_case=False,
        never_split=None,
        normalize_text=True,
        mecab_dic: Optional[str] = "ipadic",
        mecab_option: Optional[str] = None,
    ):
        """
        Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lowercase the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
            **mecab_dic**: (`optional`) string (default "ipadic")
                Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
                set this option to `None` and modify `mecab_option`.
            **mecab_option**: (`optional`) string
                String passed to MeCab constructor.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "You need to install fugashi to use MecabTokenizer. "
                "See https://pypi.org/project/fugashi/ for installation.")

        mecab_option = mecab_option or ""

        if mecab_dic is not None:
            if mecab_dic == "ipadic":
                try:
                    import ipadic
                except ModuleNotFoundError as error:
                    raise error.__class__(
                        "The ipadic dictionary is not installed. "
                        "See https://github.com/polm/ipadic-py for installation."
                    )

                dic_dir = ipadic.DICDIR

            elif mecab_dic == "unidic_lite":
                try:
                    import unidic_lite
                except ModuleNotFoundError as error:
                    raise error.__class__(
                        "The unidic_lite dictionary is not installed. "
                        "See https://github.com/polm/unidic-lite for installation."
                    )

                dic_dir = unidic_lite.DICDIR

            elif mecab_dic == "unidic":
                try:
                    import unidic
                except ModuleNotFoundError as error:
                    raise error.__class__(
                        "The unidic dictionary is not installed. "
                        "See https://github.com/polm/unidic-py for installation."
                    )

                dic_dir = unidic.DICDIR
                if not os.path.isdir(dic_dir):
                    raise RuntimeError(
                        "The unidic dictionary itself is not found."
                        "See https://github.com/polm/unidic-py for installation."
                    )

            else:
                raise ValueError("Invalid mecab_dic is specified.")

            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)
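
Note the quoted -d/-r values here, unlike the earlier examples. A sketch of why: site-packages paths can contain spaces (common on Windows), and an unquoted path would be split into bogus MeCab arguments:

import os
import ipadic

dic_dir = ipadic.DICDIR
mecabrc = os.path.join(dic_dir, "mecabrc")
mecab_option = f'-d "{dic_dir}" -r "{mecabrc}"'  # quotes keep each path as one argument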
Example #7
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if self.config.n_model == "g2p.ko":
            try:
                from g2pk import G2p as G2pK
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install g2pk with: `pip install g2pk`")
            model = G2pK()
            return PororoG2PKo(model, self.config)

        if self.config.n_model == "g2p.en":
            try:
                from g2p_en import G2p as G2pE
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install g2p_en with: `pip install g2p_en`")
            model = G2pE()
            return PororoG2PEn(model, self.config)

        if self.config.n_model == "g2p.zh":
            try:
                from g2pM import G2pM
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install g2pM with: `pip install g2pM`")
            model = G2pM()
            return PororoG2PZh(model, self.config)

        if self.config.n_model == "g2p.ja":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            try:
                import romkan
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install romkan`")
            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            tagger = fugashi.GenericTagger(mecab_option)
            return PororoG2PJa(tagger, romkan.to_roma, self.config)
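
Of the three Japanese dependencies, romkan handles the kana-to-roman step; its core call is a one-liner:

import romkan

print(romkan.to_roma("にんじゃ"))  # -> 'ninja'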
Example #8
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "sent" in self.config.n_model:
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if self.config.n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if self.config.n_model == "char":
            return PororoCharTokenizer(self.config)

        if self.config.n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if self.config.n_model == "word":
            return PororoWordTokenizer(self.config)

        if self.config.n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder = download_or_load("misc/encoder.json", self.config.lang)
            vocab = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder, vocab)

            with open(encoder, "r") as f_vocab:
                vocab = json.load(f_vocab)
                inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if self.config.n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`")
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if self.config.n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)
        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            path = download_or_load(
                f"tokenizers/{self.config.n_model}.zip",
                self.config.lang,
            )

            ext = "json" if "unigram" not in self.config.n_model else "txt"
            merges_filename = (f"{path}/merges.txt" if "unigram"
                               not in self.config.n_model else None)

            model = CustomTokenizer.from_file(
                vocab_filename=f"{path}/vocab.{ext}",
                merges_filename=merges_filename,
                normalize="jpe" not in self.config.n_model,
            )
            if "jpe" in self.config.n_model:
                return PororoJamoPairTokenizer(model, self.config)
            if "mecab.bpe" in self.config.n_model:
                return PororoMecabSPTokenizer(model, self.config)
            return PororoSPTokenizer(model, self.config)
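
For the moses branch, tokenizer and detokenizer are meant to round-trip; a minimal sketch with sacremoses:

from sacremoses import MosesDetokenizer, MosesTokenizer

mt = MosesTokenizer(lang="en")
md = MosesDetokenizer(lang="en")
tokens = mt.tokenize("Hello, world!")  # ['Hello', ',', 'world', '!']
print(md.detokenize(tokens))           # 'Hello, world!'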
Example #9
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if self.config.n_model == "nltk":
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            try:
                nltk.data.find("taggers/averaged_perceptron_tagger")
            except LookupError:
                nltk.download("averaged_perceptron_tagger")
            return PororoNLTKPosTagger(nltk, self.config)

        if self.config.n_model == "mecab-ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabPos(model, self.config)

        if self.config.n_model == "mecab-ipadic":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")
            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabJap(model, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba  # noqa
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            import jieba.posseg as jieba_pos

            model = jieba_pos
            return PororoJieba(model, self.config)
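
The jieba branch hands back the posseg module itself; its cut() yields pair objects carrying both the word and its POS flag (a sketch):

import jieba.posseg as jieba_pos

for pair in jieba_pos.cut("我来到北京清华大学"):
    print(pair.word, pair.flag)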
Example #10
    def __init__(
            self,
            do_lower_case=False,
            never_split=None,
            normalize_text=True,
            mecab_dic="ipadic",
            mecab_option=None, ):
        """
        Constructs a MecabTokenizer.

        Args:
            do_lower_case (bool):
                Whether to lowercase the input. Defaults to `False`.
            never_split (list):
                Kept for backward compatibility purposes. Defaults to `None`.
            normalize_text (bool):
                Whether to apply unicode normalization to text before tokenization. Defaults to `True`.
            mecab_dic (string):
                Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
                set this option to `None` and modify `mecab_option`. Defaults to `"ipadic"`.
            mecab_option (string):
                String passed to MeCab constructor. Defaults to `None`.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "You need to install fugashi to use MecabTokenizer. "
                "See https://pypi.org/project/fugashi/ for installation.")

        mecab_option = mecab_option or ""

        if mecab_dic is not None:
            if mecab_dic == "ipadic":
                try:
                    import ipadic
                except ModuleNotFoundError as error:
                    raise error.__class__(
                        "The ipadic dictionary is not installed. "
                        "See https://github.com/polm/ipadic-py for installation."
                    )

                dic_dir = ipadic.DICDIR

            elif mecab_dic == "unidic_lite":
                try:
                    import unidic_lite
                except ModuleNotFoundError as error:
                    raise error.__class__(
                        "The unidic_lite dictionary is not installed. "
                        "See https://github.com/polm/unidic-lite for installation."
                    )

                dic_dir = unidic_lite.DICDIR

            elif mecab_dic == "unidic":
                try:
                    import unidic
                except ModuleNotFoundError as error:
                    raise error.__class__(
                        "The unidic dictionary is not installed. "
                        "See https://github.com/polm/unidic-py for installation."
                    )

                dic_dir = unidic.DICDIR
                if not os.path.isdir(dic_dir):
                    raise RuntimeError(
                        "The unidic dictionary itself is not found."
                        "See https://github.com/polm/unidic-py for installation."
                    )
            else:
                raise ValueError("Invalid mecab_dic is specified.")

            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)
Example #11
import os

import ipadic
import fugashi

dic_dir = ipadic.DICDIR
mecabrc = os.path.join(dic_dir, "mecabrc")
mecab_option = f"-d {dic_dir} -r {mecabrc}"
tagger = fugashi.GenericTagger(mecab_option)


def tokenize(text):
    """
    A method for word segmentation.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    words : list
        A list of words
    """

    words = [word.surface for word in tagger(text)]
    return words
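
Exercising the helper (the exact segmentation depends on the installed IPADIC build, so the output shown is indicative):

print(tokenize("吾輩は猫である。"))
# e.g. ['吾輩', 'は', '猫', 'で', 'ある', '。']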


def original_usage(text):