예제 #1
0
    def _check_input_params(self):
        """Validate the combination of parameters given to the class.

        The checks run in order and the first one that fails raises:

        1. ``self.iso_code`` is handed to ``get_lang()``; the return value
           is discarded (presumably it raises for an unknown code -- confirm
           against ``get_lang``).
        2. ``self._is_fasttext_lang_available()`` must be truthy.
        3. ``self.model_type`` must be ``"bin"`` or ``"vec"``.
        4. ``self.training_set`` must be ``"common_crawl"`` or ``"wiki"``
           and that training set must provide vectors for ``self.iso_code``.

        :raises UnimplementedAlgorithmError: If no fastText embedding is
            available for ``self.iso_code``.
        :raises CLTKException: If the model type or training set is invalid,
            or the training set does not cover ``self.iso_code``.
        """

        # 1. check if lang valid
        get_lang(self.iso_code)  # check if iso_code valid

        # 2. check if any fasttext embeddings for this lang
        if not self._is_fasttext_lang_available():
            available_embeddings_str = "', '".join(
                MAP_LANGS_CLTK_FASTTEXT.keys())
            raise UnimplementedAlgorithmError(
                f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
            )

        # 3. check if requested model type is available for fasttext
        valid_model_types = ["bin", "vec"]
        if self.model_type not in valid_model_types:
            # Join the choices so the message actually lists them.
            valid_model_types_str = "', '".join(valid_model_types)
            raise CLTKException(
                f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
            )

        # 4. check if requested training set is available for language for fasttext
        # One table holds both the valid training sets and the languages each
        # one covers, so no "unanticipated" fall-through branch is needed.
        vectors_by_training_set = {
            "common_crawl": ["arb", "lat", "san"],
            "wiki": ["ang", "arb", "arc", "got", "lat", "pli", "san"],
        }
        training_sets = list(vectors_by_training_set)
        if self.training_set not in training_sets:
            training_sets_str = "', '".join(training_sets)
            raise CLTKException(
                f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
            )
        available_vectors = vectors_by_training_set[self.training_set]
        if self.iso_code not in available_vectors:
            available_vectors_str = "', '".join(available_vectors)
            raise CLTKException(
                f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
            )
예제 #2
0
    def _check_input_params(self) -> None:
        """Verify that the instance's input parameters are valid together.

        ``self.iso_code`` is first passed to ``get_lang()`` (return value
        discarded; presumably it raises for an unknown code -- confirm
        against ``get_lang``). The code must then be a key of
        ``self.MAP_LANG_TO_URL`` and ``self.model_type`` must be ``"bin"``
        or ``"txt"``.

        Raises:
            UnimplementedAlgorithmError: ``self.iso_code`` has no entry in
                ``self.MAP_LANG_TO_URL``.
            ValueError: ``self.model_type`` is not a valid model type.
        """
        # Step 1: language code sanity check.
        get_lang(self.iso_code)

        # Step 2: a model URL must be registered for this language.
        url_by_lang = self.MAP_LANG_TO_URL
        if self.iso_code not in url_by_lang:
            supported_langs = "', '".join(url_by_lang)
            raise UnimplementedAlgorithmError(
                f"No embedding available for language '{self.iso_code}'. Word2Vec models available for: '{supported_langs}'."
            )

        # Step 3: only these serialization formats are accepted.
        allowed_model_types = ("bin", "txt")
        if self.model_type in allowed_model_types:
            return
        allowed_types_str = "', '".join(allowed_model_types)
        raise ValueError(
            f"Invalid ``model_type`` {self.model_type}. Valid model types: {allowed_types_str}."
        )
예제 #3
0
def get_example_text(iso_code: str) -> str:
    """Return the example text stored for the given ISO 639-3 language code.

    The code is first passed to ``get_lang()``, which rejects unknown
    language codes (see the ``"xxx"`` example below); the text is then
    looked up in ``EXAMPLE_TEXTS``.

    :param iso_code: ISO 639-3 code of the language.
    :return: The example text registered for ``iso_code``.
    :raises UnimplementedAlgorithmError: If the code has no entry in
        ``EXAMPLE_TEXTS``. The original ``KeyError`` is attached as the
        exception's ``__cause__``.

    >>> from cltk.languages.example_texts import get_example_text
    >>> get_example_text("got")[:25]
    'swa liuhtjai liuhaþ izwar'
    >>> get_example_text("zkz")
    Traceback (most recent call last):
      ...
    cltk.core.exceptions.UnimplementedAlgorithmError: Example text unavailable for ISO 639-3 code 'zkz'.
    >>> get_example_text("xxx")
    Traceback (most recent call last):
      ...
    cltk.core.exceptions.UnknownLanguageError: Unknown ISO language code 'xxx'.
    """
    get_lang(iso_code=iso_code)
    try:
        return EXAMPLE_TEXTS[iso_code]
    except KeyError as err:
        # Chain explicitly so the traceback shows the lookup failure as the
        # direct cause rather than as an error during error handling.
        raise UnimplementedAlgorithmError(
            f"Example text unavailable for ISO 639-3 code '{iso_code}'."
        ) from err
예제 #4
0
    def _get_pipeline(self) -> Pipeline:
        """Select and instantiate the pipeline for this object's language.

        The pipeline factory is looked up in ``iso_to_pipeline`` by
        ``self.language.iso_639_3_code`` and then called with no arguments.
        Only the lookup is guarded: an exception raised while constructing
        the pipeline propagates unchanged instead of being reported as a
        missing language.

        :return: A new pipeline instance for the language.
        :raises UnimplementedAlgorithmError: If ``iso_to_pipeline`` has no
            entry for the language code.

        >>> from cltk.core.data_types import Pipeline
        >>> cltk_nlp = NLP(language="lat", suppress_banner=True)
        >>> lat_pipeline = cltk_nlp._get_pipeline()
        >>> isinstance(cltk_nlp.pipeline, Pipeline)
        True
        >>> isinstance(lat_pipeline, Pipeline)
        True
        >>> cltk_nlp = NLP(language="axm", suppress_banner=True)
        Traceback (most recent call last):
          ...
        cltk.core.exceptions.UnimplementedAlgorithmError: Valid ISO language code, however this algorithm is not available for ``axm``.
        """
        iso_code = self.language.iso_639_3_code
        try:
            pipeline_factory = iso_to_pipeline[iso_code]
        except KeyError as err:
            raise UnimplementedAlgorithmError(
                f"Valid ISO language code, however this algorithm is not available for ``{iso_code}``."
            ) from err
        # Called outside the ``try`` so a KeyError raised inside the
        # constructor is not mistaken for an unsupported language.
        return pipeline_factory()
예제 #5
0
    def transliterate(text, mode="Latin"):
        """
        Convert text between Anglo-Saxon runes and the Latin alphabet.

        Sources:
            http://www.arild-hauge.com/eanglor.htm
            https://en.wikipedia.org/wiki/Anglo-Saxon_runes

        :param text: str: The text to be transcribed
        :param mode: Selects the direction of the transliteration:

            Latin (default): runes are rendered in the Latin alphabet,
            using the Dickins system

            Anglo-Saxon/Anglo-Frisian : Latin text is rendered in
            Anglo-Saxon runes

        :raises UnimplementedAlgorithmError: if ``mode`` is not one of the
            values listed above (the problem is also logged as an error)

        Examples:

        >>> Transliterate().transliterate("Hƿæt Ƿe Gardena in geardagum", "Anglo-Saxon")
        'ᚻᚹᚫᛏ ᚹᛖ ᚷᚪᚱᛞᛖᚾᚪ ᛁᚾ ᚷᛠᚱᛞᚪᚷᚢᛗ'

        >>> Transliterate().transliterate("ᚩᚠᛏ ᛋᚳᚣᛚᛞ ᛋᚳᛖᚠᛁᛝ ᛋᚳᛠᚦᛖᚾᚪ ᚦᚱᛠᛏᚢᛗ", "Latin")
        'oft scyld scefin sceathena threatum'
        """
        # Pick the conversion table first; the helper call is shared.
        if mode == "Latin":
            conversion_table = L_Transliteration
        elif mode in ("Anglo-Saxon", "Anglo-Frisian"):
            conversion_table = R_Transliteration
        else:
            unsupported_msg = "The specified mode is currently not supported"
            LOG.error(unsupported_msg)
            raise UnimplementedAlgorithmError(unsupported_msg)

        return Transliterate.__transliterate_helper(text, conversion_table)
예제 #6
0
def tag_ner(iso_code: str, input_tokens: List[str]) -> List[Union[bool, str]]:
    """Run NER for chosen language. Some languages return boolean True/False,
    others give string of entity type (e.g., ``LOC``).

    For Old French (``fro``) and for the word-list based languages the
    result holds exactly one entry per input token, in the same order.
    For ``ang``, ``grc`` and ``lat`` the result is whatever
    ``spacy_tag_ner()`` returns.

    :param iso_code: ISO 639-3 code of the language; validated with
        ``get_lang()`` and required to be a key of ``NER_DICT``.
    :param input_tokens: Word tokens to tag.
    :return: Per-token result: for ``fro`` the entity kind of a matching
        token or ``False``; for word-list languages ``True``/``False``.
    :raises UnimplementedAlgorithmError: If ``iso_code`` is not in
        ``NER_DICT``.

    >>> from cltk.ner.ner import tag_ner
    >>> from cltk.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> tokens = split_punct_ws(get_example_text(iso_code="lat"))

    >>> text = "ἐπὶ δ᾽ οὖν τοῖς πρώτοις τοῖσδε Περικλῆς ὁ Ξανθίππου ᾑρέθη λέγειν. καὶ ἐπειδὴ καιρὸς ἐλάμβανε, προελθὼν ἀπὸ τοῦ σήματος ἐπὶ βῆμα ὑψηλὸν πεποιημένον, ὅπως ἀκούοιτο ὡς ἐπὶ πλεῖστον τοῦ ὁμίλου, ἔλεγε τοιάδε."
    >>> tokens = split_punct_ws(text)
    >>> are_words_entities = tag_ner(iso_code="grc", input_tokens=tokens)
    >>> tokens[:9]
    ['ἐπὶ', 'δ᾽', 'οὖν', 'τοῖς', 'πρώτοις', 'τοῖσδε', 'Περικλῆς', 'ὁ', 'Ξανθίππου']
    >>> are_words_entities[:9] # TODO check this result
    [False, False, False, False, False, False, False, False, False]

    >>> tokens = split_punct_ws(get_example_text(iso_code="fro"))
    >>> are_words_entities = tag_ner(iso_code="fro", input_tokens=tokens)
    >>> tokens[30:49]
    ['Bretaigne', 'A', 'I', 'molt', 'riche', 'chevalier', 'Hardi', 'et', 'coragous', 'et', 'fier', 'De', 'la', 'Table', 'Reonde', 'estoit', 'Le', 'roi', 'Artu']
    >>> are_words_entities[30:49]
    ['LOC', False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 'CHI']
    """

    get_lang(iso_code=iso_code)
    if iso_code not in NER_DICT:
        msg = f"NER unavailable for language ``{iso_code}``."
        raise UnimplementedAlgorithmError(msg)
    ner_file_path = os.path.expanduser(NER_DICT[iso_code])

    if iso_code == "fro":
        if not os.path.isfile(ner_file_path):
            msg = f"Old French model path '{ner_file_path}' not found. Going to try to download it ..."
            logging.warning(msg)
            dl_msg = "This part of the CLTK depends upon models from the CLTK project."
            model_url = "https://github.com/cltk/fro_models_cltk"
            download_prompt(iso_code=iso_code,
                            message=dl_msg,
                            model_url=model_url)
        # The Old French entity list ships as a Python source file that
        # defines ``entities``, an iterable of (token, kind) pairs.
        # NOTE(review): ``Loader.load_module()`` is deprecated in the
        # standard library; kept here to preserve its import side effects.
        loader = importlib.machinery.SourceFileLoader("entities",
                                                      ner_file_path)
        module = loader.load_module()
        entities = module.entities
        # ``setdefault`` keeps the first kind listed for a token, matching
        # the first-match-wins scan this replaces.
        kind_by_token = {}
        for entity_token, kind in entities:
            kind_by_token.setdefault(entity_token, kind)
        # Exactly one entry per token: the entity kind, or False when the
        # token is not a known entity.
        return [kind_by_token.get(input_token, False)
                for input_token in input_tokens]

    if iso_code in ["ang", "grc", "lat"]:
        return spacy_tag_ner(iso_code=iso_code,
                             text_tokens=input_tokens,
                             model_path=NER_DICT[iso_code])

    # Remaining languages: the model file is a newline-separated word list.
    with open(ner_file_path) as file_open:
        known_entities = set(file_open.read().split("\n"))
    return [word_token in known_entities for word_token in input_tokens]
예제 #7
0
    def __init__(
        self,
        language: str,
        treebank: Optional[str] = None,
        stanza_debug_level="ERROR",
        interactive: bool = True,
        silent: bool = False,
    ) -> None:
        """Set up a ``StanzaWrapper`` for one language and treebank.

        The constructor stores its arguments, rejects the incompatible
        ``interactive``/``silent`` combination, checks that the language
        is supported, settles on a treebank (the caller's choice if it is
        valid, otherwise the default), downloads the model when
        ``_is_model_present()`` reports it missing, and finally loads the
        pipeline into ``self.nlp``.

        >>> stanza_wrapper = StanzaWrapper(language='grc', stanza_debug_level="INFO", interactive=False)
        >>> isinstance(stanza_wrapper, StanzaWrapper)
        True
        >>> stanza_wrapper.language
        'grc'
        >>> stanza_wrapper.treebank
        'proiel'

        >>> stanza_wrapper = StanzaWrapper(language="grc", treebank="perseus", stanza_debug_level="INFO", interactive=False)
        >>> isinstance(stanza_wrapper, StanzaWrapper)
        True
        >>> stanza_wrapper.language
        'grc'
        >>> stanza_wrapper.treebank
        'perseus'
        >>> from cltk.languages.example_texts import get_example_text
        >>> stanza_doc = stanza_wrapper.parse(get_example_text("grc"))

        >>> StanzaWrapper(language="xxx", stanza_debug_level="INFO")
        Traceback (most recent call last):
          ...
        cltk.core.exceptions.UnknownLanguageError: Language 'xxx' either not in scope for CLTK or not supported by Stanza.

        >>> stanza_wrapper = StanzaWrapper(language="grc", treebank="proiel", stanza_debug_level="INFO", interactive=False)
        >>> stanza_doc = stanza_wrapper.parse(get_example_text("grc"))

        >>> stanza_wrapper = StanzaWrapper(language="lat", treebank="perseus", stanza_debug_level="INFO", interactive=False)
        >>> stanza_doc = stanza_wrapper.parse(get_example_text("lat"))

        >>> stanza_wrapper = StanzaWrapper(language="lat", treebank="proiel", stanza_debug_level="INFO", interactive=False)
        >>> stanza_doc = stanza_wrapper.parse(get_example_text("lat"))

        >>> stanza_wrapper = StanzaWrapper(language="chu", stanza_debug_level="INFO", interactive=False)
        >>> stanza_doc = stanza_wrapper.parse(get_example_text("chu"))

        >>> stanza_wrapper = StanzaWrapper(language="cop", stanza_debug_level="INFO", interactive=False)  # doctest: +SKIP
        >>> stanza_doc = stanza_wrapper.parse(get_example_text("cop"))  # doctest: +SKIP

        >>> stanza_wrapper = StanzaWrapper(language="lzh", stanza_debug_level="INFO", interactive=False)
        >>> stanza_doc = stanza_wrapper.parse(get_example_text("lzh"))

        >>> stanza_wrapper = StanzaWrapper(language="lat", treebank="xxx", stanza_debug_level="INFO")
        Traceback (most recent call last):
          ...
        cltk.core.exceptions.UnimplementedAlgorithmError: Invalid treebank 'xxx' for language 'lat'.
        """
        # Keep the raw constructor arguments on the instance; the helper
        # methods called below are expected to read them from ``self``.
        self.language = language
        self.treebank = treebank
        self.stanza_debug_level = stanza_debug_level
        self.interactive = interactive
        self.silent = silent

        # The two flags contradict each other when both are set.
        if self.silent and self.interactive:
            raise ValueError(
                "``interactive`` and ``silent`` options are not compatible with each other."
            )

        # Language setup: CLTK ISO code -> language name as used by Stanza.
        self.map_langs_cltk_stanza = {
            "chu": "Old_Church_Slavonic",
            "cop": "Coptic",
            "fro": "Old_French",
            "grc": "Ancient_Greek",
            "got": "Gothic",
            "lat": "Latin",
            "lzh": "Classical_Chinese",
        }

        self.wrapper_available: bool = self.is_wrapper_available()
        if not self.wrapper_available:
            raise UnknownLanguageError(
                f"Language '{self.language}' either not in scope for CLTK or not supported by Stanza."
            )
        self.stanza_code = self._get_stanza_code()

        # Treebank setup.
        # TODO: Write tests for all treebanks
        self.map_code_treebanks = {
            "grc": ["proiel", "perseus"],
            "la": ["perseus", "proiel", "ittb"],
        }
        if not self.treebank:
            # Nothing requested by the caller: fall back to the default.
            self.treebank = self._get_default_treebank()
        elif not self._is_valid_treebank():
            raise UnimplementedAlgorithmError(
                f"Invalid treebank '{self.treebank}' for language '{self.language}'."
            )

        # Model presence check. This path only confirms that some model
        # has already been downloaded.
        # TODO: This is a weak check for the models actually being downloaded and valid
        # TODO: Use ``models_dir`` var from below and make self. or global to module
        self.model_path = os.path.expanduser(
            f"~/stanza_resources/{self.stanza_code}/tokenize/{self.treebank}.pt"
        )
        if not self._is_model_present():
            self._download_model()

        # Build the actual stanza object. ``suppress_stdout`` keeps
        # `stanza` from printing a long log of its parameters to screen;
        # ideally `_load_pipeline()` would capture that output for the
        # log file instead.
        with suppress_stdout():
            self.nlp = self._load_pipeline()