Code example #1
def tag_ner(iso_code: str, input_tokens: List[str]) -> List[Union[bool, str]]:
    """Run NER for chosen language. Some languages return boolean True/False,
    others give string of entity type (e.g., ``LOC``).

    >>> from cltk.ner.ner import tag_ner
    >>> from cltk.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> tokens = split_punct_ws(get_example_text(iso_code="lat"))
    >>> are_words_entities = tag_ner(iso_code="lat", input_tokens=tokens)
    >>> tokens[:5]
    ['Gallia', 'est', 'omnis', 'divisa', 'in']
    >>> are_words_entities[:5]
    [True, False, False, False, False]

    >>> text = "ἐπὶ δ᾽ οὖν τοῖς πρώτοις τοῖσδε Περικλῆς ὁ Ξανθίππου ᾑρέθη λέγειν. καὶ ἐπειδὴ καιρὸς ἐλάμβανε, προελθὼν ἀπὸ τοῦ σήματος ἐπὶ βῆμα ὑψηλὸν πεποιημένον, ὅπως ἀκούοιτο ὡς ἐπὶ πλεῖστον τοῦ ὁμίλου, ἔλεγε τοιάδε."
    >>> tokens = split_punct_ws(text)
    >>> are_words_entities = tag_ner(iso_code="grc", input_tokens=tokens)
    >>> tokens[:9]
    ['ἐπὶ', 'δ᾽', 'οὖν', 'τοῖς', 'πρώτοις', 'τοῖσδε', 'Περικλῆς', 'ὁ', 'Ξανθίππου']
    >>> are_words_entities[:9]
    [False, False, False, False, False, False, True, False, True]

    >>> tokens = split_punct_ws(get_example_text(iso_code="fro"))
    >>> are_words_entities = tag_ner(iso_code="fro", input_tokens=tokens)
    >>> tokens[30:50]
    ['Bretaigne', 'A', 'I', 'molt', 'riche', 'chevalier', 'Hardi', 'et', 'coragous', 'et', 'fier', 'De', 'la', 'Table', 'Reonde', 'estoit', 'Le', 'roi', 'Artu', 'que']
    >>> are_words_entities[30:50]
    ['LOC', False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 'CHI']
    """

    get_lang(iso_code=iso_code)
    if iso_code not in NER_DICT:
        msg = f"NER unavailable for language ``{iso_code}``."
        raise UnimplementedAlgorithmError(msg)
    ner_file_path = os.path.expanduser(NER_DICT[iso_code])
    if iso_code == "fro":
        loader = importlib.machinery.SourceFileLoader("entities", ner_file_path)
        module = loader.load_module()  # the loaded entities module
        entities = module.entities  # pairs of (entity_token, entity_type)
        entities_type_list = list()
        for input_token in input_tokens:
            for entity_token, kind in entities:
                if input_token == entity_token:
                    entities_type_list.append(kind)
                    break
            else:
                entities_type_list.append(False)
        return entities_type_list
    else:
        with open(ner_file_path) as file_open:
            ner_str = file_open.read()
        ner_list = ner_str.split("\n")
        is_entity_list = list()  # type: List[bool]
        for word_token in input_tokens:
            if word_token in ner_list:
                is_entity_list.append(True)
            else:
                is_entity_list.append(False)
        return is_entity_list
Code example #2
File: embeddings.py Project: kylepjohnson/cltkv1
    def _is_fasttext_lang_available(self) -> bool:
        """Return whether any fastText vectors are available for the
        input language. This covers only the fastText embeddings added
        to the CLTK, not every published fastText model.
        """
        get_lang(iso_code=self.iso_code)
        return self.iso_code in self.MAP_LANGS_CLTK_FASTTEXT
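For context, here is a minimal usage sketch of the check above. It assumes the method belongs to the ``FastTextEmbeddings`` class in ``cltk.embeddings.embeddings`` and that the constructor takes an ``iso_code`` argument; names may differ between CLTK versions.

# A hedged sketch, assuming cltk.embeddings.embeddings.FastTextEmbeddings
# exposes the method above and accepts an ``iso_code`` argument.
from cltk.embeddings.embeddings import FastTextEmbeddings

embeddings = FastTextEmbeddings(iso_code="lat")        # may prompt to download vectors
print(embeddings._is_fasttext_lang_available())        # True: "lat" is mapped to a fastText model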
Code example #3
File: embeddings.py Project: kylepjohnson/cltkv1
    def _check_input_params(self):
        """Look at combination of parameters give to class
        and determine if any invalid combination or missing
        models.
        """

        # 1. check if lang valid
        get_lang(self.iso_code)  # check if iso_code valid

        # 2. check if any fasttext embeddings for this lang
        if not self._is_fasttext_lang_available():
            available_embeddings_str = "', '".join(
                self.MAP_LANGS_CLTK_FASTTEXT.keys())
            raise UnimplementedAlgorithmError(
                f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
            )

        # 3. check if requested model type is available for fasttext
        valid_model_types = ["bin", "vec"]
        if self.model_type not in valid_model_types:
            valid_model_types_str = "', '"
            raise CLTKException(
                f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
            )

        # 4. check if requested training set is available for language for fasttext
        training_sets = ["common_crawl", "wiki"]
        if self.training_set not in training_sets:
            training_sets_str = "', '".join(training_sets)
            raise CLTKException(
                f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
            )
        available_vectors = list()
        if self.training_set == "wiki":
            available_vectors = [
                "ang", "arb", "arc", "got", "lat", "pli", "san"
            ]
        elif self.training_set == "common_crawl":
            available_vectors = ["arb", "lat", "san"]
        else:
            raise CLTKException("Unanticipated exception.")
        if self.iso_code not in available_vectors:
            available_vectors_str = "', '".join(available_vectors)
            raise CLTKException(
                f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
            )
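The same validate-or-raise pattern used throughout ``_check_input_params`` can be reduced to a small standalone helper. The sketch below is illustrative only; the helper name ``check_choice`` is hypothetical and not part of the CLTK.

# Illustrative only: a generic validate-or-raise helper mirroring the checks
# above; ``check_choice`` is a hypothetical name, not a CLTK function.
from typing import Sequence


def check_choice(name: str, value: str, valid: Sequence[str]) -> None:
    """Raise ValueError if ``value`` is not one of ``valid``."""
    if value not in valid:
        valid_str = "', '".join(valid)
        raise ValueError(f"Invalid ``{name}`` '{value}'. Choose: '{valid_str}'.")


check_choice("model_type", "vec", ["bin", "vec"])                 # passes silently
check_choice("training_set", "giga", ["common_crawl", "wiki"])    # raises ValueError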
Code example #4
    def __init__(
        self,
        language: str,
        custom_pipeline: Pipeline = None,
        suppress_banner: bool = False,
    ) -> None:
        """Constructor for CLTK class.

        Args:
            language: ISO code
            custom_pipeline: Optional ``Pipeline`` for processing text.
            suppress_banner: If ``True``, suppress printing the pipeline banner.

        >>> from cltk import NLP
        >>> cltk_nlp = NLP(language="lat", suppress_banner=True)
        >>> isinstance(cltk_nlp, NLP)
        True
        >>> from cltk.core.data_types import Pipeline
        >>> from cltk.tokenizers import LatinTokenizationProcess
        >>> from cltk.languages.utils import get_lang
        >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat"))
        >>> nlp = NLP(language="lat", custom_pipeline=a_pipeline, suppress_banner=True)
        >>> nlp.pipeline is a_pipeline
        True
        """
        self.language = get_lang(language)  # type: Language
        self.pipeline = custom_pipeline if custom_pipeline else self._get_pipeline()
        if not suppress_banner:
            self._print_pipelines_for_current_lang()
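A short usage sketch of the constructor above. ``NLP.analyze`` and the ``tokens`` attribute of the returned ``Doc`` are assumed from the CLTK 1.x API and may vary by version.

# A hedged sketch, assuming NLP.analyze() returns a Doc with a ``tokens``
# attribute, as in CLTK 1.x.
from cltk import NLP
from cltk.languages.example_texts import get_example_text

cltk_nlp = NLP(language="lat", suppress_banner=True)
cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))  # run the default Latin pipeline
print(cltk_doc.tokens[:5])                                 # first five word tokens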
Code example #5
File: pipelines.py Project: free-variation/cltk
class LatinPipeline(Pipeline):
    """Default ``Pipeline`` for Latin.

    TODO: Add stopword annotation for all relevant pipelines.

    >>> from cltk.languages.pipelines import LatinPipeline
    >>> a_pipeline = LatinPipeline()
    >>> a_pipeline.description
    'Pipeline for the Latin language'
    >>> a_pipeline.language
    Language(name='Latin', glottolog_id='lati1261', latitude=41.9026, longitude=12.4502, dates=[], family_id='indo1319', parent_id='impe1234', level='language', iso_639_3_code='lat', type='a')
    >>> a_pipeline.language.name
    'Latin'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.LatinStanzaProcess'>
    """

    description: str = "Pipeline for the Latin language"
    language: Language = get_lang("lat")
    processes: List[Type[Process]] = field(default_factory=lambda: [
        LatinNormalizeProcess,
        # LatinTokenizationProcess,
        LatinStanzaProcess,
        LatinEmbeddingsProcess,
        StopsProcess,
        LatinNERProcess,
        LatinLexiconProcess,
    ])
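If the full default pipeline is heavier than needed, a trimmed-down custom ``Pipeline`` can be passed to ``NLP`` instead, as in the doctest of code example #4. A sketch under that assumption:

# A sketch reusing the Pipeline/NLP combination shown in code example #4:
# a Latin pipeline reduced to tokenization only.
from cltk import NLP
from cltk.core.data_types import Pipeline
from cltk.languages.utils import get_lang
from cltk.tokenizers import LatinTokenizationProcess

light_pipeline = Pipeline(
    description="Latin tokenization only",
    processes=[LatinTokenizationProcess],
    language=get_lang("lat"),
)
nlp = NLP(language="lat", custom_pipeline=light_pipeline, suppress_banner=True)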
Code example #6
File: pipelines.py Project: free-variation/cltk
class GreekPipeline(Pipeline):
    """Default ``Pipeline`` for Ancient Greek.

    >>> from cltk.languages.pipelines import GreekPipeline
    >>> a_pipeline = GreekPipeline()
    >>> a_pipeline.description
    'Pipeline for the Greek language'
    >>> a_pipeline.language
    Language(name='Ancient Greek', glottolog_id='anci1242', latitude=39.8155, longitude=21.9129, dates=[], family_id='indo1319', parent_id='east2798', level='language', iso_639_3_code='grc', type='h')
    >>> a_pipeline.language.name
    'Ancient Greek'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.GreekStanzaProcess'>
    """

    description: str = "Pipeline for the Greek language"
    language: Language = get_lang("grc")
    processes: List[Type[Process]] = field(default_factory=lambda: [
        # GreekTokenizationProcess,
        GreekNormalizeProcess,
        GreekStanzaProcess,
        GreekEmbeddingsProcess,
        StopsProcess,
        GreekNERProcess,
    ])
Code example #7
    def __init__(self, language: str, testing: bool = False):
        """Setup corpus importing.

        `testing` is a hack to check a tmp .yaml file to look at
        or local corpus. This keeps from overwriting local. A
        better idea is probably to refuse to overwrite the .yaml.
        """

        self.language = language.lower()
        if self.language != "multilingual":
            get_lang(iso_code=language)

        assert isinstance(testing, bool), "``testing`` parameter must be boolean type"
        self.testing = testing

        self.user_defined_corpora = self._get_user_defined_corpora()
        self.library_defined_corpora = self._get_library_defined_corpora()
        self.all_corpora_for_lang = (self.user_defined_corpora +
                                     self.library_defined_corpora)
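The class this constructor belongs to is not named in the snippet; it appears to be the corpus fetcher in ``cltk.data.fetch`` (an assumption). A hedged usage sketch:

# A hedged sketch, assuming the __init__ above belongs to
# cltk.data.fetch.FetchCorpus (the class name is not shown in the snippet).
from cltk.data.fetch import FetchCorpus

fetcher = FetchCorpus(language="lat")     # validates "lat" via get_lang()
print(fetcher.all_corpora_for_lang)       # user-defined plus library-defined corpora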
Code example #8
File: example_texts.py Project: yelircaasi/cltk
def get_example_text(iso_code: str) -> str:
    """Take in search term of usual language name and find ISO code.

    >>> from cltk.languages.example_texts import get_example_text
    >>> get_example_text("got")[:25]
    'swa liuhtjai liuhaþ izwar'
    >>> get_example_text("zkz")
    Traceback (most recent call last):
      ...
    cltk.core.exceptions.UnimplementedAlgorithmError: Example text unavailable for ISO 639-3 code 'zkz'.
    >>> get_example_text("xxx")
    Traceback (most recent call last):
      ...
    cltk.core.exceptions.UnknownLanguageError: Unknown ISO language code 'xxx'.
    """
    get_lang(iso_code=iso_code)
    try:
        return EXAMPLE_TEXTS[iso_code]
    except KeyError:
        raise UnimplementedAlgorithmError(
            f"Example text unavailable for ISO 639-3 code '{iso_code}'.")
Code example #9
File: embeddings.py Project: kylepjohnson/cltkv1
    def _check_input_params(self) -> None:
        """Confirm that input parameters are valid and in a
        valid configuration.
        """
        # 1. check if lang valid
        get_lang(self.iso_code)  # check if iso_code valid

        # 2. check if any fasttext embeddings for this lang
        if self.iso_code not in self.MAP_LANG_TO_URL:
            available_embeddings_str = "', '".join(self.MAP_LANG_TO_URL.keys())
            raise UnimplementedAlgorithmError(
                f"No embedding available for language '{self.iso_code}'. Word2Vec models available for: '{available_embeddings_str}'."
            )

        # 3. assert that model type is valid
        valid_types = ["bin", "txt"]
        if self.model_type not in valid_types:
            unavailable_types_str = "', '".join(valid_types)
            raise ValueError(
                f"Invalid ``model_type`` {self.model_type}. Valid model types: {unavailable_types_str}."
            )
Code example #10
File: pipelines.py Project: free-variation/cltk
class OCSPipeline(Pipeline):
    """Default ``Pipeline`` for Old Church Slavonic.

    >>> from cltk.languages.pipelines import OCSPipeline
    >>> a_pipeline = OCSPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old Church Slavonic language'
    >>> a_pipeline.language
    Language(name='Church Slavic', glottolog_id='chur1257', latitude=43.7171, longitude=22.8442, dates=[], family_id='indo1319', parent_id='east2269', level='language', iso_639_3_code='chu', type='a')
    >>> a_pipeline.language.name
    'Church Slavic'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.OCSStanzaProcess'>
    """

    description: str = "Pipeline for the Old Church Slavonic language"
    language: Language = get_lang("chu")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [OCSStanzaProcess])
Code example #11
File: pipelines.py Project: free-variation/cltk
class MiddleHighGermanPipeline(Pipeline):
    """Default ``Pipeline`` for Middle High German.

    >>> a_pipeline = MiddleHighGermanPipeline()
    >>> a_pipeline.description
    'Pipeline for the Middle High German language.'
    >>> a_pipeline.language
    Language(name='Middle High German', glottolog_id='midd1343', latitude=0.0, longitude=0.0, dates=[], family_id='indo1319', parent_id='midd1349', level='language', iso_639_3_code='gmh', type='h')
    >>> a_pipeline.language.name
    'Middle High German'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MiddleHighGermanTokenizationProcess'>
    """

    description: str = "Pipeline for the Middle High German language."
    language: Language = get_lang("gmh")
    processes: List[Type[Process]] = field(
        default_factory=lambda:
        [MiddleHighGermanTokenizationProcess, StopsProcess])
Code example #12
File: pipelines.py Project: kylepjohnson/cltkv1
class OldNorsePipeline(Pipeline):
    """Default ``Pipeline`` for Old Norse.

    >>> from cltk.languages.pipelines import OldNorsePipeline
    >>> a_pipeline = OldNorsePipeline()
    >>> a_pipeline.description
    'Pipeline for the Old Norse language'
    >>> a_pipeline.language
    Language(name='Old Norse', glottolog_id='oldn1244', latitude=63.42, longitude=10.38, dates=[], family_id='indo1319', parent_id='west2805', level='language', iso_639_3_code='non', type='h')
    >>> a_pipeline.language.name
    'Old Norse'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.OldNorseTokenizationProcess'>
    """

    description: str = "Pipeline for the Old Norse language"
    language: Language = get_lang("non")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [OldNorseTokenizationProcess, StopsProcess])
Code example #13
File: pipelines.py Project: free-variation/cltk
class CopticPipeline(Pipeline):
    """Default ``Pipeline`` for Coptic.

    >>> from cltk.languages.pipelines import CopticPipeline
    >>> a_pipeline = CopticPipeline()
    >>> a_pipeline.description
    'Pipeline for the Coptic language'
    >>> a_pipeline.language
    Language(name='Coptic', glottolog_id='copt1239', latitude=29.472, longitude=31.2053, dates=[], family_id='afro1255', parent_id='egyp1245', level='language', iso_639_3_code='cop', type='')
    >>> a_pipeline.language.name
    'Coptic'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.CopticStanzaProcess'>
    """

    description: str = "Pipeline for the Coptic language"
    language: Language = get_lang("cop")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [CopticStanzaProcess, StopsProcess])
Code example #14
File: pipelines.py Project: free-variation/cltk
class ChinesePipeline(Pipeline):
    """Default ``Pipeline`` for Classical Chinese.

    >>> from cltk.languages.pipelines import ChinesePipeline
    >>> a_pipeline = ChinesePipeline()
    >>> a_pipeline.description
    'Pipeline for the Classical Chinese language'
    >>> a_pipeline.language
    Language(name='Literary Chinese', glottolog_id='lite1248', latitude=0.0, longitude=0.0, dates=[], family_id='sino1245', parent_id='clas1255', level='language', iso_639_3_code='lzh', type='h')
    >>> a_pipeline.language.name
    'Literary Chinese'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.ChineseStanzaProcess'>
    """

    description: str = "Pipeline for the Classical Chinese language"
    language: Language = get_lang("lzh")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [ChineseStanzaProcess])
Code example #15
File: pipelines.py Project: free-variation/cltk
class AkkadianPipeline(Pipeline):
    """Default ``Pipeline`` for Akkadian.

    >>> from cltk.languages.pipelines import AkkadianPipeline
    >>> a_pipeline = AkkadianPipeline()
    >>> a_pipeline.description
    'Pipeline for the Akkadian language.'
    >>> a_pipeline.language
    Language(name='Akkadian', glottolog_id='akka1240', latitude=33.1, longitude=44.1, dates=[], family_id='afro1255', parent_id='east2678', level='language', iso_639_3_code='akk', type='a')
    >>> a_pipeline.language.name
    'Akkadian'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.AkkadianTokenizationProcess'>
    """

    description: str = "Pipeline for the Akkadian language."
    language: Language = get_lang("akk")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [AkkadianTokenizationProcess, StopsProcess])
Code example #16
File: pipelines.py Project: free-variation/cltk
class PanjabiPipeline(Pipeline):
    """Default ``Pipeline`` for Panjabi.

    >>> from cltk.languages.pipelines import PanjabiPipeline
    >>> a_pipeline = PanjabiPipeline()
    >>> a_pipeline.description
    'Pipeline for the Panjabi language.'
    >>> a_pipeline.language
    Language(name='Eastern Panjabi', glottolog_id='panj125', latitude=30.0368, longitude=75.6702, dates=[], family_id='indo1319', parent_id='east2727', level='language', iso_639_3_code='pan', type='')
    >>> a_pipeline.language.name
    'Eastern Panjabi'
    >>> a_pipeline.processes[1]
    <class 'cltk.stops.processes.StopsProcess'>
    """

    description: str = "Pipeline for the Panjabi language."
    language: Language = get_lang("pan")
    processes: List[Type[Process]] = field(
        default_factory=lambda:
        [MultilingualTokenizationProcess, StopsProcess])
Code example #17
File: pipelines.py Project: free-variation/cltk
class HindiPipeline(Pipeline):
    """Default ``Pipeline`` for Hindi.

    >>> from cltk.languages.pipelines import HindiPipeline
    >>> a_pipeline = HindiPipeline()
    >>> a_pipeline.description
    'Pipeline for the Hindi language.'
    >>> a_pipeline.language
    Language(name='Hindi', glottolog_id='hind1269', latitude=25.0, longitude=77.0, dates=[], family_id='indo1319', parent_id='hind1270', level='language', iso_639_3_code='hin', type='')
    >>> a_pipeline.language.name
    'Hindi'
    >>> a_pipeline.processes[1]
    <class 'cltk.stops.processes.StopsProcess'>
    """

    description: str = "Pipeline for the Hindi language."
    language: Language = get_lang("hin")
    processes: List[Type[Process]] = field(
        default_factory=lambda:
        [MultilingualTokenizationProcess, StopsProcess])
Code example #18
File: pipelines.py Project: free-variation/cltk
class GothicPipeline(Pipeline):
    """Default ``Pipeline`` for Gothic.

    >>> from cltk.languages.pipelines import GothicPipeline
    >>> a_pipeline = GothicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Gothic language'
    >>> a_pipeline.language
    Language(name='Gothic', glottolog_id='goth1244', latitude=46.9304, longitude=29.9786, dates=[], family_id='indo1319', parent_id='east2805', level='language', iso_639_3_code='got', type='a')
    >>> a_pipeline.language.name
    'Gothic'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.GothicStanzaProcess'>
    >>> a_pipeline.processes[1]
    <class 'cltk.embeddings.processes.GothicEmbeddingsProcess'>
    """

    description: str = "Pipeline for the Gothic language"
    language: Language = get_lang("got")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [GothicStanzaProcess, GothicEmbeddingsProcess])
Code example #19
File: pipelines.py Project: free-variation/cltk
class MiddleFrenchPipeline(Pipeline):
    """Default ``Pipeline`` for Middle French.

    TODO: Figure out whether the dedicated tokenizer is good enough or necessary; we have Stanza for Old French, which might be able to tokenize Middle French fine.

    >>> from cltk.languages.pipelines import MiddleFrenchPipeline
    >>> a_pipeline = MiddleFrenchPipeline()
    >>> a_pipeline.description
    'Pipeline for the Middle French language'
    >>> a_pipeline.language
    Language(name='Middle French', glottolog_id='midd1316', latitude=0.0, longitude=0.0, dates=[], family_id='indo1319', parent_id='stan1290', level='dialect', iso_639_3_code='frm', type='h')
    >>> a_pipeline.language.name
    'Middle French'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MiddleFrenchTokenizationProcess'>
    """

    description: str = "Pipeline for the Middle French language"
    language: Language = get_lang("frm")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [MiddleFrenchTokenizationProcess])
Code example #20
File: pipelines.py Project: free-variation/cltk
class PaliPipeline(Pipeline):
    """Default ``Pipeline`` for Pali.

    TODO: Make better tokenizer for Pali.

    >>> from cltk.languages.pipelines import PaliPipeline
    >>> a_pipeline = PaliPipeline()
    >>> a_pipeline.description
    'Pipeline for the Pali language'
    >>> a_pipeline.language
    Language(name='Pali', glottolog_id='pali1273', latitude=24.5271, longitude=82.251, dates=[], family_id='indo1319', parent_id='biha1245', level='language', iso_639_3_code='pli', type='a')
    >>> a_pipeline.language.name
    'Pali'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MultilingualTokenizationProcess'>
    """

    description: str = "Pipeline for the Pali language"
    language: Language = get_lang("pli")
    processes: List[Type[Process]] = field(
        default_factory=lambda:
        [MultilingualTokenizationProcess, PaliEmbeddingsProcess])
Code example #21
File: pipelines.py Project: free-variation/cltk
class ArabicPipeline(Pipeline):
    """Default ``Pipeline`` for Arabic.

    >>> from cltk.languages.pipelines import ArabicPipeline
    >>> a_pipeline = ArabicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Arabic language'
    >>> a_pipeline.language
    Language(name='Standard Arabic', glottolog_id='stan1318', latitude=27.9625, longitude=43.8525, dates=[], family_id='afro1255', parent_id='arab1395', level='language', iso_639_3_code='arb', type='')
    >>> a_pipeline.language.name
    'Standard Arabic'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.ArabicTokenizationProcess'>
    """

    description: str = "Pipeline for the Arabic language"
    language: Language = get_lang("arb")
    processes: List[Type[Process]] = field(default_factory=lambda: [
        ArabicTokenizationProcess,
        ArabicEmbeddingsProcess,
        StopsProcess,
    ])
Code example #22
File: pipelines.py Project: free-variation/cltk
class OldFrenchPipeline(Pipeline):
    """Default ``Pipeline`` for Old French.

    >>> from cltk.languages.pipelines import OldFrenchPipeline
    >>> a_pipeline = OldFrenchPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old French language'
    >>> a_pipeline.language
    Language(name='Old French (842-ca. 1400)', glottolog_id='oldf1239', latitude=0.0, longitude=0.0, dates=[], family_id='indo1319', parent_id='oila1234', level='language', iso_639_3_code='fro', type='h')
    >>> a_pipeline.language.name
    'Old French (842-ca. 1400)'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.OldFrenchStanzaProcess'>
    """

    description: str = "Pipeline for the Old French language"
    language: Language = get_lang("fro")
    processes: List[Type[Process]] = field(default_factory=lambda: [
        # OldFrenchTokenizationProcess,
        OldFrenchStanzaProcess,
        StopsProcess,
        OldFrenchNERProcess,
    ])
Code example #23
File: pipelines.py Project: kylepjohnson/cltkv1
class OldEnglishPipeline(Pipeline):
    """Default ``Pipeline`` for Old English.

    >>> from cltk.languages.pipelines import OldEnglishPipeline
    >>> a_pipeline = OldEnglishPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old English language'
    >>> a_pipeline.language
    Language(name='Old English (ca. 450-1100)', glottolog_id='olde1238', latitude=51.06, longitude=-1.31, dates=[], family_id='indo1319', parent_id='angl1265', level='language', iso_639_3_code='ang', type='h')
    >>> a_pipeline.language.name
    'Old English (ca. 450-1100)'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MultilingualTokenizationProcess'>
    """

    description: str = "Pipeline for the Old English language"
    language: Language = get_lang("ang")
    processes: List[Type[Process]] = field(default_factory=lambda: [
        MultilingualTokenizationProcess,
        OldEnglishLemmatizationProcess,
        OldEnglishEmbeddingsProcess,
        StopsProcess,
    ])
Code example #24
File: pipelines.py Project: free-variation/cltk
class AramaicPipeline(Pipeline):
    """Default ``Pipeline`` for Aramaic.

    TODO: Confirm with specialist what encodings should be expected.
    TODO: Replace ``ArabicTokenizationProcess`` with a multilingual one or a specific Aramaic.

    >>> from cltk.languages.pipelines import AramaicPipeline
    >>> a_pipeline = AramaicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Aramaic language'
    >>> a_pipeline.language
    Language(name='Official Aramaic (700-300 BCE)', glottolog_id='', latitude=0.0, longitude=0.0, dates=[], family_id='', parent_id='', level='', iso_639_3_code='arc', type='a')
    >>> a_pipeline.language.name
    'Official Aramaic (700-300 BCE)'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.ArabicTokenizationProcess'>
    """

    description: str = "Pipeline for the Aramaic language"
    language: Language = get_lang("arc")
    processes: List[Type[Process]] = field(default_factory=lambda: [
        ArabicTokenizationProcess,  # Note: Using Arabic tokenizer for Aramaic. Is this OK?
        AramaicEmbeddingsProcess,
    ])
Code example #25
File: pipelines.py Project: free-variation/cltk
class SanskritPipeline(Pipeline):
    """Default ``Pipeline`` for Sanskrit.

    TODO: Make better tokenizer for Sanskrit.

    >>> from cltk.languages.pipelines import SanskritPipeline
    >>> a_pipeline = SanskritPipeline()
    >>> a_pipeline.description
    'Pipeline for the Sanskrit language.'
    >>> a_pipeline.language
    Language(name='Sanskrit', glottolog_id='sans1269', latitude=20.0, longitude=77.0, dates=[], family_id='indo1319', parent_id='indo1321', level='language', iso_639_3_code='san', type='a')
    >>> a_pipeline.language.name
    'Sanskrit'
    >>> a_pipeline.processes[1]
    <class 'cltk.embeddings.processes.SanskritEmbeddingsProcess'>
    """

    description: str = "Pipeline for the Sanskrit language."
    language: Language = get_lang("san")
    processes: List[Type[Process]] = field(default_factory=lambda: [
        MultilingualTokenizationProcess,
        SanskritEmbeddingsProcess,
        StopsProcess,
    ])
Code example #26
File: words.py Project: yelircaasi/cltk
    def __init__(self, iso_code: str):
        self.iso_code = iso_code
        get_lang(iso_code=self.iso_code)
        self.stops = self.get_stopwords()
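A usage sketch for the stopword class this ``__init__`` belongs to. It assumes ``words.py`` defines a ``Stops`` class with a ``remove_stopwords`` method; check your CLTK version.

# A hedged sketch, assuming cltk.stops.words defines a ``Stops`` class
# with a ``remove_stopwords`` method (names may differ between versions).
from cltk.stops.words import Stops

stops = Stops(iso_code="lat")                     # loads the Latin stopword list
tokens = ["Gallia", "est", "omnis", "divisa", "in", "partes", "tres"]
print(stops.remove_stopwords(tokens))             # drops tokens found in the stopword list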
Code example #27
def tag_ner(iso_code: str, input_tokens: List[str]) -> List[Union[bool, str]]:
    """Run NER for chosen language. Some languages return boolean True/False,
    others give string of entity type (e.g., ``LOC``).

    >>> from cltk.ner.ner import tag_ner
    >>> from cltk.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> tokens = split_punct_ws(get_example_text(iso_code="lat"))

    >>> text = "ἐπὶ δ᾽ οὖν τοῖς πρώτοις τοῖσδε Περικλῆς ὁ Ξανθίππου ᾑρέθη λέγειν. καὶ ἐπειδὴ καιρὸς ἐλάμβανε, προελθὼν ἀπὸ τοῦ σήματος ἐπὶ βῆμα ὑψηλὸν πεποιημένον, ὅπως ἀκούοιτο ὡς ἐπὶ πλεῖστον τοῦ ὁμίλου, ἔλεγε τοιάδε."
    >>> tokens = split_punct_ws(text)
    >>> are_words_entities = tag_ner(iso_code="grc", input_tokens=tokens)
    >>> tokens[:9]
    ['ἐπὶ', 'δ᾽', 'οὖν', 'τοῖς', 'πρώτοις', 'τοῖσδε', 'Περικλῆς', 'ὁ', 'Ξανθίππου']
    >>> are_words_entities[:9] # TODO check this result
    [False, False, False, False, False, False, False, False, False]

    >>> tokens = split_punct_ws(get_example_text(iso_code="fro"))
    >>> are_words_entities = tag_ner(iso_code="fro", input_tokens=tokens)
    >>> tokens[30:50]
    ['Bretaigne', 'A', 'I', 'molt', 'riche', 'chevalier', 'Hardi', 'et', 'coragous', 'et', 'fier', 'De', 'la', 'Table', 'Reonde', 'estoit', 'Le', 'roi', 'Artu', 'que']
    >>> are_words_entities[30:50]
    ['LOC', False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 'CHI']
    """

    get_lang(iso_code=iso_code)
    if iso_code not in NER_DICT:
        msg = f"NER unavailable for language ``{iso_code}``."
        raise UnimplementedAlgorithmError(msg)
    ner_file_path = os.path.expanduser(NER_DICT[iso_code])
    if iso_code == "fro":
        if not os.path.isfile(ner_file_path):
            msg = f"Old French model path '{ner_file_path}' not found. Going to try to download it ..."
            logging.warning(msg)
            dl_msg = f"This part of the CLTK depends upon models from the CLTK project."
            model_url = "https://github.com/cltk/fro_models_cltk"
            download_prompt(iso_code=iso_code,
                            message=dl_msg,
                            model_url=model_url)
        loader = importlib.machinery.SourceFileLoader("entities",
                                                      ner_file_path)
        module = loader.load_module()  # the loaded entities module
        entities = module.entities  # pairs of (entity_token, entity_type)
        entities_type_list = list()
        for input_token in input_tokens:
            for entity_token, kind in entities:
                if input_token == entity_token:
                    entities_type_list.append(kind)
                    break
            else:
                entities_type_list.append(False)
        return entities_type_list
    elif iso_code in ["ang", "grc", "lat"]:
        return spacy_tag_ner(iso_code=iso_code,
                             text_tokens=input_tokens,
                             model_path=NER_DICT[iso_code])  # List[str, None]
    else:
        with open(ner_file_path) as file_open:
            ner_str = file_open.read()
        ner_list = ner_str.split("\n")
        is_entity_list = list()  # type: List[bool]
        for word_token in input_tokens:
            if word_token in ner_list:
                is_entity_list.append(True)
            else:
                is_entity_list.append(False)
        return is_entity_list