def tag_ner(iso_code: str, input_tokens: List[str]) -> List[Union[bool, str]]:
    """Run NER for chosen language.

    Some languages return boolean True/False, others give a string of
    the entity type (e.g., ``LOC``).

    >>> from cltkv1.ner.ner import tag_ner
    >>> from cltkv1.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> tokens = split_punct_ws(get_example_text(iso_code="lat"))
    >>> are_words_entities = tag_ner(iso_code="lat", input_tokens=tokens)
    >>> tokens[:5]
    ['Gallia', 'est', 'omnis', 'divisa', 'in']
    >>> are_words_entities[:5]
    [True, False, False, False, False]
    >>> text = "ἐπὶ δ᾽ οὖν τοῖς πρώτοις τοῖσδε Περικλῆς ὁ Ξανθίππου ᾑρέθη λέγειν. καὶ ἐπειδὴ καιρὸς ἐλάμβανε, προελθὼν ἀπὸ τοῦ σήματος ἐπὶ βῆμα ὑψηλὸν πεποιημένον, ὅπως ἀκούοιτο ὡς ἐπὶ πλεῖστον τοῦ ὁμίλου, ἔλεγε τοιάδε."
    >>> tokens = split_punct_ws(text)
    >>> are_words_entities = tag_ner(iso_code="grc", input_tokens=tokens)
    >>> tokens[:9]
    ['ἐπὶ', 'δ᾽', 'οὖν', 'τοῖς', 'πρώτοις', 'τοῖσδε', 'Περικλῆς', 'ὁ', 'Ξανθίππου']
    >>> are_words_entities[:9]
    [False, False, False, False, False, False, True, False, True]
    >>> tokens = split_punct_ws(get_example_text(iso_code="fro"))
    >>> are_words_entities = tag_ner(iso_code="fro", input_tokens=tokens)
    >>> tokens[30:50]
    ['Bretaigne', 'A', 'I', 'molt', 'riche', 'chevalier', 'Hardi', 'et', 'coragous', 'et', 'fier', 'De', 'la', 'Table', 'Reonde', 'estoit', 'Le', 'roi', 'Artu', 'que']
    >>> are_words_entities[30:50]
    ['LOC', False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 'CHI']

    Args:
        iso_code: ISO 639-3 code of the language to tag.
        input_tokens: Word tokens to check against the language's NER list.

    Returns:
        One element per input token: an entity-type string (``fro``) or a
        plain boolean (other languages).

    Raises:
        UnimplementedAlgorithmError: If no NER model exists for ``iso_code``.
    """
    get_lang(iso_code=iso_code)
    if iso_code not in NER_DICT:
        msg = f"NER unavailable for language ``{iso_code}``."
        raise UnimplementedAlgorithmError(msg)
    ner_file_path = os.path.expanduser(NER_DICT[iso_code])
    if iso_code == "fro":
        # The Old French model ships as a Python module exposing an
        # ``entities`` sequence of (token, entity_kind) pairs.
        loader = importlib.machinery.SourceFileLoader("entities", ner_file_path)
        module = loader.load_module()
        entities = module.entities  # type: Tuple[str, str]
        entities_type_list = list()  # type: List[Union[str, bool]]
        for input_token in input_tokens:
            for entity_token, kind in entities:
                if input_token == entity_token:
                    entities_type_list.append(kind)
                    break
            else:
                # Bug fix: ``False`` is appended only when the inner loop
                # found no match. Previously it was appended unconditionally,
                # so a matched token yielded two entries and the output no
                # longer lined up with ``input_tokens``.
                entities_type_list.append(False)
        return entities_type_list
    else:
        # Other languages ship a newline-separated word list; membership in
        # the list means the token is an entity.
        with open(ner_file_path) as file_open:
            ner_str = file_open.read()
        # Use a set for O(1) membership instead of scanning the list per token.
        ner_set = set(ner_str.split("\n"))
        return [word_token in ner_set for word_token in input_tokens]
def _is_fasttext_lang_available(self) -> bool:
    """Return whether any fastText vectors are available for the input
    language. This is not comprehensive of all fastText embeddings,
    only those added into the CLTK.
    """
    # Validates the ISO code first; raises for unknown languages.
    get_lang(iso_code=self.iso_code)
    return self.iso_code in self.MAP_LANGS_CLTK_FASTTEXT
def _check_input_params(self):
    """Look at the combination of parameters given to the class and
    determine if any invalid combination or missing models.

    Raises:
        UnimplementedAlgorithmError: No fastText embeddings for the language.
        CLTKException: Invalid ``model_type`` or ``training_set``, or the
            training set lacks vectors for the language.
    """
    # 1. check if lang valid
    get_lang(self.iso_code)  # check if iso_code valid
    # 2. check if any fasttext embeddings for this lang
    if not self._is_fasttext_lang_available():
        available_embeddings_str = "', '".join(
            self.MAP_LANGS_CLTK_FASTTEXT.keys())
        raise UnimplementedAlgorithmError(
            f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
        )
    # 3. check if requested model type is available for fasttext
    valid_model_types = ["bin", "vec"]
    if self.model_type not in valid_model_types:
        # Bug fix: join the valid types; previously only the separator
        # string "', '" was assigned, so the message listed no types.
        valid_model_types_str = "', '".join(valid_model_types)
        raise CLTKException(
            f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
        )
    # 4. check if requested training set is available for language for fasttext
    training_sets = ["common_crawl", "wiki"]
    if self.training_set not in training_sets:
        training_sets_str = "', '".join(training_sets)
        raise CLTKException(
            f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
        )
    available_vectors = list()
    if self.training_set == "wiki":
        available_vectors = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    elif self.training_set == "common_crawl":
        available_vectors = ["arb", "lat", "san"]
    else:
        # Bug fix: the exception was created but never raised, silently
        # falling through with an empty ``available_vectors``.
        raise CLTKException("Unanticipated exception.")
    if self.iso_code not in available_vectors:
        available_vectors_str = "', '".join(available_vectors)
        raise CLTKException(
            f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
        )
class LatinPipeline(Pipeline):
    """Default ``Pipeline`` for Latin.

    TODO: Add stopword annotation for all relevant pipelines.

    >>> from cltkv1.languages.pipelines import LatinPipeline
    >>> a_pipeline = LatinPipeline()
    >>> a_pipeline.description
    'Pipeline for the Latin language'
    >>> a_pipeline.language
    Language(name='Latin', glottolog_id='lati1261', latitude=41.9026, longitude=12.4502, dates=[], family_id='indo1319', parent_id='impe1234', level='language', iso_639_3_code='lat', type='a')
    >>> a_pipeline.language.name
    'Latin'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.dependency.processes.LatinStanzaProcess'>
    """

    description: str = "Pipeline for the Latin language"
    language: Language = get_lang("lat")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            # LatinTokenizationProcess,
            LatinStanzaProcess,
            LatinEmbeddingsProcess,
            StopsProcess,
            LatinNERProcess,
        ]
    )
def __init__(self, language: str, custom_pipeline: Pipeline = None) -> None:
    """Constructor for CLTK class.

    Args:
        language: ISO code
        custom_pipeline: Optional ``Pipeline`` for processing text.

    >>> from cltkv1 import NLP
    >>> cltk_nlp = NLP(language="lat")
    >>> isinstance(cltk_nlp, NLP)
    True
    >>> from cltkv1.core.data_types import Pipeline
    >>> from cltkv1.tokenizers import LatinTokenizationProcess
    >>> from cltkv1.languages.utils import get_lang
    >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat"))
    >>> nlp = NLP(language="lat", custom_pipeline=a_pipeline)
    >>> nlp.pipeline is a_pipeline
    True
    """
    # Resolve and validate the ISO code up front.
    self.language = get_lang(language)  # type: Language
    # A caller-supplied pipeline wins; otherwise fall back to the default
    # pipeline registered for this language.
    if custom_pipeline:
        self.pipeline = custom_pipeline
    else:
        self.pipeline = self._get_pipeline()
def _check_input_params(self) -> None:
    """Confirm that input parameters are valid and in a valid configuration."""
    # 1. validate the ISO language code (raises for unknown codes)
    get_lang(self.iso_code)

    # 2. ensure Word2Vec vectors exist for this language
    if self.iso_code not in self.MAP_LANG_TO_URL:
        available_embeddings_str = "', '".join(self.MAP_LANG_TO_URL.keys())
        raise UnimplementedAlgorithmError(
            f"No embedding available for language '{self.iso_code}'. Word2Vec models available for: '{available_embeddings_str}'."
        )

    # 3. ensure the requested model type is one we can load
    valid_types = ["bin", "txt"]
    if self.model_type not in valid_types:
        valid_types_str = "', '".join(valid_types)
        raise ValueError(
            f"Invalid ``model_type`` {self.model_type}. Valid model types: {valid_types_str}."
        )
def get_example_text(iso_code: str) -> str:
    """Take in search term of usual language name and find ISO code.

    >>> from cltkv1.languages.example_texts import get_example_text
    >>> get_example_text("got")[:25]
    'swa liuhtjai liuhaþ izwar'
    >>> get_example_text("zkz")
    Traceback (most recent call last):
      ...
    cltkv1.core.exceptions.UnimplementedAlgorithmError: Example text unavailable for ISO 639-3 code 'zkz'.
    >>> get_example_text("xxx")
    Traceback (most recent call last):
      ...
    cltkv1.core.exceptions.UnknownLanguageError: Unknown ISO language code 'xxx'.
    """
    # Raises UnknownLanguageError for codes not known to the CLTK at all.
    get_lang(iso_code=iso_code)
    # Known language but no example text recorded for it.
    if iso_code not in EXAMPLE_TEXTS:
        raise UnimplementedAlgorithmError(
            f"Example text unavailable for ISO 639-3 code '{iso_code}'."
        )
    return EXAMPLE_TEXTS[iso_code]
def _is_fasttext_lang_available(self) -> bool:
    """Return whether any fastText vectors are available for the input
    language. This is not comprehensive of all fastText embeddings,
    only those added into the CLTK.

    # >>> from cltkv1.embeddings.embeddings import FastTextEmbeddings
    # >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", silent=True)
    # >>> embeddings_obj._is_fasttext_lang_available()
    # True
    # >>> embeddings_obj = FastTextEmbeddings(iso_code="ave, silent=True")
    # Traceback (most recent call last):
    #   ..
    # cltkv1.core.exceptions.UnimplementedAlgorithmError: No embedding available for language 'ave'. FastTextEmbeddings available for: 'ang', 'arb', 'arc', 'got', 'lat', 'pli', 'san'.
    # >>> embeddings_obj = FastTextEmbeddings(iso_code="xxx", silent=True)
    # Traceback (most recent call last):
    #   ..
    # cltkv1.core.exceptions.UnknownLanguageError
    """
    # Validate the ISO code; unknown codes raise before the lookup.
    get_lang(iso_code=self.iso_code)
    return self.iso_code in self.MAP_LANGS_CLTK_FASTTEXT
class HindiPipeline(Pipeline):
    """Default ``Pipeline`` for Hindi.

    >>> from cltkv1.languages.pipelines import HindiPipeline
    >>> a_pipeline = HindiPipeline()
    >>> a_pipeline.description
    'Pipeline for the Hindi language.'
    >>> a_pipeline.language
    Language(name='Hindi', glottolog_id='hind1269', latitude=25.0, longitude=77.0, dates=[], family_id='indo1319', parent_id='hind1270', level='language', iso_639_3_code='hin', type='')
    >>> a_pipeline.language.name
    'Hindi'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.SanskritTokenizationProcess'>
    """

    # Bug fix: the doctest previously imported ``SanskritPipeline`` but then
    # called ``HindiPipeline()``, which would fail with NameError under doctest.
    description: str = "Pipeline for the Hindi language."
    language: Language = get_lang("hin")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [SanskritTokenizationProcess, StopsProcess])
class CopticPipeline(Pipeline):
    """Default ``Pipeline`` for Coptic.

    >>> from cltkv1.languages.pipelines import CopticPipeline
    >>> a_pipeline = CopticPipeline()
    >>> a_pipeline.description
    'Pipeline for the Coptic language'
    >>> a_pipeline.language
    Language(name='Coptic', glottolog_id='copt1239', latitude=29.472, longitude=31.2053, dates=[], family_id='afro1255', parent_id='egyp1245', level='language', iso_639_3_code='cop', type='')
    >>> a_pipeline.language.name
    'Coptic'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.dependency.processes.CopticStanzaProcess'>
    """

    description: str = "Pipeline for the Coptic language"
    language: Language = get_lang("cop")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            CopticStanzaProcess,
        ]
    )
class ChinesePipeline(Pipeline):
    """Default ``Pipeline`` for Classical Chinese.

    >>> from cltkv1.languages.pipelines import ChinesePipeline
    >>> a_pipeline = ChinesePipeline()
    >>> a_pipeline.description
    'Pipeline for the Classical Chinese language'
    >>> a_pipeline.language
    Language(name='Literary Chinese', glottolog_id='lite1248', latitude=0.0, longitude=0.0, dates=[], family_id='sino1245', parent_id='clas1255', level='language', iso_639_3_code='lzh', type='h')
    >>> a_pipeline.language.name
    'Literary Chinese'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.dependency.processes.ChineseStanzaProcess'>
    """

    description: str = "Pipeline for the Classical Chinese language"
    language: Language = get_lang("lzh")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            ChineseStanzaProcess,
        ]
    )
class OCSPipeline(Pipeline):
    """Default ``Pipeline`` for Old Church Slavonic.

    >>> from cltkv1.languages.pipelines import OCSPipeline
    >>> a_pipeline = OCSPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old Church Slavonic language'
    >>> a_pipeline.language
    Language(name='Church Slavic', glottolog_id='chur1257', latitude=43.7171, longitude=22.8442, dates=[], family_id='indo1319', parent_id='east2269', level='language', iso_639_3_code='chu', type='a')
    >>> a_pipeline.language.name
    'Church Slavic'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.dependency.processes.OCSStanzaProcess'>
    """

    description: str = "Pipeline for the Old Church Slavonic language"
    language: Language = get_lang("chu")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            OCSStanzaProcess,
        ]
    )
class AkkadianPipeline(Pipeline):
    """Default ``Pipeline`` for Akkadian.

    >>> from cltkv1.languages.pipelines import AkkadianPipeline
    >>> a_pipeline = AkkadianPipeline()
    >>> a_pipeline.description
    'Pipeline for the Akkadian language.'
    >>> a_pipeline.language
    Language(name='Akkadian', glottolog_id='akka1240', latitude=33.1, longitude=44.1, dates=[], family_id='afro1255', parent_id='east2678', level='language', iso_639_3_code='akk', type='a')
    >>> a_pipeline.language.name
    'Akkadian'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.AkkadianTokenizationProcess'>
    """

    description: str = "Pipeline for the Akkadian language."
    language: Language = get_lang("akk")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            AkkadianTokenizationProcess,
            StopsProcess,
        ]
    )
class PanjabiPipeline(Pipeline):
    """Default ``Pipeline`` for Panjabi.

    >>> from cltkv1.languages.pipelines import PanjabiPipeline
    >>> a_pipeline = PanjabiPipeline()
    >>> a_pipeline.description
    'Pipeline for the Panjabi language.'
    >>> a_pipeline.language
    Language(name='Eastern Panjabi', glottolog_id='panj125', latitude=30.0368, longitude=75.6702, dates=[], family_id='indo1319', parent_id='east2727', level='language', iso_639_3_code='pan', type='')
    >>> a_pipeline.language.name
    'Eastern Panjabi'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.SanskritTokenizationProcess'>
    """

    # Bug fix: the doctest previously imported ``SanskritPipeline`` but then
    # called ``PanjabiPipeline()``, which would fail with NameError under doctest.
    # NOTE(review): glottolog_id 'panj125' looks truncated (ids are usually
    # 8 chars, e.g. 'panj1256') — verify against the languages table.
    description: str = "Pipeline for the Panjabi language."
    language: Language = get_lang("pan")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [SanskritTokenizationProcess, StopsProcess])
class MHGPipeline(Pipeline):
    """Default ``Pipeline`` for Middle High German.

    >>> from cltkv1.languages.pipelines import MHGPipeline
    >>> a_pipeline = MHGPipeline()
    >>> a_pipeline.description
    'Pipeline for the Middle High German language.'
    >>> a_pipeline.language
    Language(name='Middle High German', glottolog_id='midd1343', latitude=0.0, longitude=0.0, dates=[], family_id='indo1319', parent_id='midd1349', level='language', iso_639_3_code='gmh', type='h')
    >>> a_pipeline.language.name
    'Middle High German'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.MHGTokenizationProcess'>
    """

    description: str = "Pipeline for the Middle High German language."
    language: Language = get_lang("gmh")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            MHGTokenizationProcess,
            StopsProcess,
        ]
    )
class OldNorsePipeline(Pipeline):
    """Default ``Pipeline`` for Old Norse.

    >>> from cltkv1.languages.pipelines import OldNorsePipeline
    >>> a_pipeline = OldNorsePipeline()
    >>> a_pipeline.description
    'Pipeline for the Old Norse language'
    >>> a_pipeline.language
    Language(name='Old Norse', glottolog_id='oldn1244', latitude=63.42, longitude=10.38, dates=[], family_id='indo1319', parent_id='west2805', level='language', iso_639_3_code='non', type='h')
    >>> a_pipeline.language.name
    'Old Norse'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.OldNorseTokenizationProcess'>
    """

    description: str = "Pipeline for the Old Norse language"
    language: Language = get_lang("non")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            OldNorseTokenizationProcess,
            StopsProcess,
        ]
    )
class GothicPipeline(Pipeline):
    """Default ``Pipeline`` for Gothic.

    >>> from cltkv1.languages.pipelines import GothicPipeline
    >>> a_pipeline = GothicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Gothic language'
    >>> a_pipeline.language
    Language(name='Gothic', glottolog_id='goth1244', latitude=46.9304, longitude=29.9786, dates=[], family_id='indo1319', parent_id='east2805', level='language', iso_639_3_code='got', type='a')
    >>> a_pipeline.language.name
    'Gothic'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.dependency.processes.GothicStanzaProcess'>
    >>> a_pipeline.processes[1]
    <class 'cltkv1.embeddings.processes.GothicEmbeddingsProcess'>
    """

    description: str = "Pipeline for the Gothic language"
    language: Language = get_lang("got")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            GothicStanzaProcess,
            GothicEmbeddingsProcess,
        ]
    )
class MiddleFrenchPipeline(Pipeline):
    """Default ``Pipeline`` for Middle French.

    TODO: Figure out whether this the dedicated tokenizer is good enough or necessary; we have stanza for Old French, which might be able to tokenizer fine.

    >>> from cltkv1.languages.pipelines import MiddleFrenchPipeline
    >>> a_pipeline = MiddleFrenchPipeline()
    >>> a_pipeline.description
    'Pipeline for the Middle French language'
    >>> a_pipeline.language
    Language(name='Middle French', glottolog_id='midd1316', latitude=0.0, longitude=0.0, dates=[], family_id='indo1319', parent_id='stan1290', level='dialect', iso_639_3_code='frm', type='h')
    >>> a_pipeline.language.name
    'Middle French'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.MiddleFrenchTokenizationProcess'>
    """

    description: str = "Pipeline for the Middle French language"
    language: Language = get_lang("frm")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            MiddleFrenchTokenizationProcess,
        ]
    )
class PaliPipeline(Pipeline):
    """Default ``Pipeline`` for Pali.

    TODO: Make better tokenizer for Pali.

    >>> from cltkv1.languages.pipelines import PaliPipeline
    >>> a_pipeline = PaliPipeline()
    >>> a_pipeline.description
    'Pipeline for the Pali language'
    >>> a_pipeline.language
    Language(name='Pali', glottolog_id='pali1273', latitude=24.5271, longitude=82.251, dates=[], family_id='indo1319', parent_id='biha1245', level='language', iso_639_3_code='pli', type='a')
    >>> a_pipeline.language.name
    'Pali'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.MultilingualTokenizationProcess'>
    """

    description: str = "Pipeline for the Pali language"
    language: Language = get_lang("pli")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            MultilingualTokenizationProcess,
            PaliEmbeddingsProcess,
        ]
    )
class ArabicPipeline(Pipeline):
    """Default ``Pipeline`` for Arabic.

    >>> from cltkv1.languages.pipelines import ArabicPipeline
    >>> a_pipeline = ArabicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Arabic language'
    >>> a_pipeline.language
    Language(name='Standard Arabic', glottolog_id='stan1318', latitude=27.9625, longitude=43.8525, dates=[], family_id='afro1255', parent_id='arab1395', level='language', iso_639_3_code='arb', type='')
    >>> a_pipeline.language.name
    'Standard Arabic'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.ArabicTokenizationProcess'>
    """

    description: str = "Pipeline for the Arabic language"
    language: Language = get_lang("arb")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            ArabicTokenizationProcess,
            ArabicEmbeddingsProcess,
            StopsProcess,
        ]
    )
class OldEnglishPipeline(Pipeline):
    """Default ``Pipeline`` for Old English.

    >>> from cltkv1.languages.pipelines import OldEnglishPipeline
    >>> a_pipeline = OldEnglishPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old English language'
    >>> a_pipeline.language
    Language(name='Old English (ca. 450-1100)', glottolog_id='olde1238', latitude=51.06, longitude=-1.31, dates=[], family_id='indo1319', parent_id='angl1265', level='language', iso_639_3_code='ang', type='h')
    >>> a_pipeline.language.name
    'Old English (ca. 450-1100)'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.MultilingualTokenizationProcess'>
    """

    description: str = "Pipeline for the Old English language"
    language: Language = get_lang("ang")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            MultilingualTokenizationProcess,
            OldEnglishEmbeddingsProcess,
            StopsProcess,
        ]
    )
class OldFrenchPipeline(Pipeline):
    """Default ``Pipeline`` for Old French.

    >>> from cltkv1.languages.pipelines import OldFrenchPipeline
    >>> a_pipeline = OldFrenchPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old French language'
    >>> a_pipeline.language
    Language(name='Old French (842-ca. 1400)', glottolog_id='oldf1239', latitude=0.0, longitude=0.0, dates=[], family_id='indo1319', parent_id='oila1234', level='language', iso_639_3_code='fro', type='h')
    >>> a_pipeline.language.name
    'Old French (842-ca. 1400)'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.dependency.processes.OldFrenchStanzaProcess'>
    """

    description: str = "Pipeline for the Old French language"
    language: Language = get_lang("fro")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            # OldFrenchTokenizationProcess,
            OldFrenchStanzaProcess,
            StopsProcess,
            OldFrenchNERProcess,
        ]
    )
class GreekPipeline(Pipeline):
    """Default ``Pipeline`` for Ancient Greek.

    >>> from cltkv1.languages.pipelines import GreekPipeline
    >>> a_pipeline = GreekPipeline()
    >>> a_pipeline.description
    'Pipeline for the Greek language'
    >>> a_pipeline.language
    Language(name='Ancient Greek', glottolog_id='anci1242', latitude=39.8155, longitude=21.9129, dates=[], family_id='indo1319', parent_id='east2798', level='language', iso_639_3_code='grc', type='h')
    >>> a_pipeline.language.name
    'Ancient Greek'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.dependency.processes.GreekStanzaProcess'>
    """

    description: str = "Pipeline for the Greek language"
    language: Language = get_lang("grc")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            # GreekTokenizationProcess,
            GreekStanzaProcess,
            GreekEmbeddingsProcess,
            StopsProcess,
            GreekNERProcess,
        ]
    )
class SanskritPipeline(Pipeline):
    """Default ``Pipeline`` for Sanskrit.

    TODO: Make better tokenizer for Sanskrit.

    >>> from cltkv1.languages.pipelines import SanskritPipeline
    >>> a_pipeline = SanskritPipeline()
    >>> a_pipeline.description
    'Pipeline for the Sanskrit language.'
    >>> a_pipeline.language
    Language(name='Sanskrit', glottolog_id='sans1269', latitude=20.0, longitude=77.0, dates=[], family_id='indo1319', parent_id='indo1321', level='language', iso_639_3_code='san', type='a')
    >>> a_pipeline.language.name
    'Sanskrit'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.SanskritTokenizationProcess'>
    """

    description: str = "Pipeline for the Sanskrit language."
    language: Language = get_lang("san")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            SanskritTokenizationProcess,
            SanskritEmbeddingsProcess,
            StopsProcess,
        ]
    )
class AramaicPipeline(Pipeline):
    """Default ``Pipeline`` for Aramaic.

    TODO: Confirm with specialist what encodings should be expected.
    TODO: Replace ``ArabicTokenizationProcess`` with a multilingual one or a specific Aramaic.

    >>> from cltkv1.languages.pipelines import AramaicPipeline
    >>> a_pipeline = AramaicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Aramaic language'
    >>> a_pipeline.language
    Language(name='Official Aramaic (700-300 BCE)', glottolog_id='', latitude=0.0, longitude=0.0, dates=[], family_id='', parent_id='', level='', iso_639_3_code='arc', type='a')
    >>> a_pipeline.language.name
    'Official Aramaic (700-300 BCE)'
    >>> a_pipeline.processes[0]
    <class 'cltkv1.tokenizers.processes.ArabicTokenizationProcess'>
    """

    description: str = "Pipeline for the Aramaic language"
    language: Language = get_lang("arc")
    processes: List[Type[Process]] = field(
        default_factory=lambda: [
            # Note: Using Arabic tokenizer for Aramaic. Is this OK?
            ArabicTokenizationProcess,
            AramaicEmbeddingsProcess,
        ]
    )
def __init__(self, iso_code: str):
    """Store the ISO 639-3 code, validate it, and load the stopword list.

    Args:
        iso_code: ISO 639-3 code of the language whose stops to load.
    """
    self.iso_code = iso_code
    # Raises for unknown ISO codes before any stopword lookup happens.
    get_lang(iso_code=self.iso_code)
    self.stops = self.get_stopwords()
def _check_input_params(self):
    """Look at the combination of parameters given to the class and
    determine if any invalid combination or missing models.

    >>> from cltkv1.embeddings.embeddings import FastTextEmbeddings
    >>> fasttext_model = FastTextEmbeddings(iso_code="lat", interactive=False, overwrite=False, silent=True)
    >>> type(fasttext_model)
    <class 'cltkv1.embeddings.embeddings.FastTextEmbeddings'>
    >>> fasttext_model = FastTextEmbeddings(iso_code="ave", interactive=False, overwrite=False, silent=True)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ..
    cltkv1.core.exceptions.UnimplementedAlgorithmError: No embedding available for language 'ave'. FastTextEmbeddings available for: ...
    >>> fasttext_model = FastTextEmbeddings(iso_code="xxx", interactive=False, overwrite=False, silent=True)
    Traceback (most recent call last):
      ..
    cltkv1.core.exceptions.UnknownLanguageError: Unknown ISO language code 'xxx'.
    >>> fasttext_model = FastTextEmbeddings(iso_code="got", training_set="wiki", interactive=False, overwrite=False, silent=True)  # doctest: +ELLIPSIS
    >>> type(fasttext_model)
    <class 'cltkv1.embeddings.embeddings.FastTextEmbeddings'>
    >>> fasttext_model = FastTextEmbeddings(iso_code="got", training_set="common_crawl", interactive=False, overwrite=False, silent=True)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ..
    cltkv1.core.exceptions.CLTKException: Training set 'common_crawl' not available for language 'got'. Languages available for this training set: ...

    TODO: Add tests for ``.bin`` files, too

    Raises:
        UnimplementedAlgorithmError: No fastText embeddings for the language.
        CLTKException: Invalid ``model_type`` or ``training_set``, or the
            training set lacks vectors for the language.
    """
    # 1. check if lang valid
    get_lang(self.iso_code)  # check if iso_code valid
    # 2. check if any fasttext embeddings for this lang
    if not self._is_fasttext_lang_available():
        available_embeddings_str = "', '".join(
            self.MAP_LANGS_CLTK_FASTTEXT.keys())
        raise UnimplementedAlgorithmError(
            f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
        )
    # 3. check if requested model type is available for fasttext
    valid_model_types = ["bin", "vec"]
    if self.model_type not in valid_model_types:
        # Bug fix: join the valid types; previously only the separator
        # string "', '" was assigned, so the message listed no types.
        valid_model_types_str = "', '".join(valid_model_types)
        raise CLTKException(
            f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
        )
    # 4. check if requested training set is available for language for fasttext
    training_sets = ["common_crawl", "wiki"]
    if self.training_set not in training_sets:
        training_sets_str = "', '".join(training_sets)
        raise CLTKException(
            f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
        )
    available_vectors = list()
    if self.training_set == "wiki":
        available_vectors = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    elif self.training_set == "common_crawl":
        available_vectors = ["arb", "lat", "san"]
    else:
        # Bug fix: the exception was created but never raised, silently
        # falling through with an empty ``available_vectors``.
        raise CLTKException("Unanticipated exception.")
    if self.iso_code not in available_vectors:
        available_vectors_str = "', '".join(available_vectors)
        raise CLTKException(
            f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
        )