def download_fasttext_model(iso_code: str, model_source: str = "wiki", interactive=False) -> None:
    """Download fasttext model.

    TODO: Add way to specify a Common Crawl model.
    """
    print(f"Going to download fasttext model for '{iso_code}'.")
    avail_sources = ["wiki", "common_crawl"]
    assert (
        model_source in avail_sources
    ), f"Invalid `model_source`. Choose from: {', '.join(avail_sources)}."
    all_wiki_models = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    if model_source == "wiki" and iso_code not in all_wiki_models:
        raise CLTKException(
            f"Language '{iso_code}' not available for `model_source` '{model_source}'. Choose from: {', '.join(all_wiki_models)}."
        )
    all_common_crawl_models = ["arb", "lat", "san"]
    if model_source == "common_crawl" and iso_code not in all_common_crawl_models:
        raise CLTKException(
            f"Language '{iso_code}' not available for `model_source` '{model_source}'. Choose from: {', '.join(all_common_crawl_models)}."
        )
    FastTextEmbeddings(
        iso_code=iso_code, interactive=interactive, overwrite=False, silent=False
    )
    print(f"Finished downloading fasttext for '{iso_code}'.")
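# Example call (a sketch, kept as a comment): this performs a real download of
# the Latin wiki vectors into the local ``cltk_data`` directory, so run it only
# when that is intended.
#
#     download_fasttext_model(iso_code="lat", model_source="wiki", interactive=False)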
def _check_input_params(self):
    """Look at the combination of parameters given to the class
    and determine if there is any invalid combination or missing model.
    """
    # 1. check if lang valid
    get_lang(self.iso_code)  # check if iso_code valid

    # 2. check if any fasttext embeddings for this lang
    if not self._is_fasttext_lang_available():
        available_embeddings_str = "', '".join(self.MAP_LANGS_CLTK_FASTTEXT.keys())
        raise UnimplementedAlgorithmError(
            f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
        )

    # 3. check if requested model type is available for fasttext
    valid_model_types = ["bin", "vec"]
    if self.model_type not in valid_model_types:
        valid_model_types_str = "', '".join(valid_model_types)
        raise CLTKException(
            f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
        )

    # 4. check if requested training set is available for language for fasttext
    training_sets = ["common_crawl", "wiki"]
    if self.training_set not in training_sets:
        training_sets_str = "', '".join(training_sets)
        raise CLTKException(
            f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
        )
    available_vectors = list()
    if self.training_set == "wiki":
        available_vectors = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    elif self.training_set == "common_crawl":
        available_vectors = ["arb", "lat", "san"]
    else:
        raise CLTKException("Unanticipated exception.")
    if self.iso_code not in available_vectors:
        available_vectors_str = "', '".join(available_vectors)
        raise CLTKException(
            f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
        )
def lookup(self, lemma: str) -> str:
    """Perform match of a lemma against headwords. This is case sensitive.
    If more than one match, then return the concatenated entries. For example:

    >>> onzl = OldNorseZoegaLexicon()
    >>> onzl.lookup("sonr")
    '(gen. sonar, dat. syni and søni; pl. synir, sønir; ace. sonu and syni), m. son.'
    """
    if not self.entries:
        raise CLTKException(
            "No dictionary entries found in the .yaml file. This should never happen."
        )

    if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
        return ""

    keys = self.entries.keys()
    matches = [key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)]
    n_matches = len(matches)
    if n_matches > 1:
        return "\n".join([self.entries[key] for key in matches])
    elif n_matches == 1:
        return self.entries[matches[0]]
    else:
        return ""
def download_prompt(
    iso_code: str,
    message: str,
    model_url: str,
    interactive: bool = True,
    silent: bool = False,
):
    """Ask user whether to download files.

    TODO: Make ft and stanza use this fn. Consider moving to other module.
    """
    fetch_corpus = FetchCorpus(language=iso_code)
    if not interactive:
        if not silent:
            print(message)
        fetch_corpus.import_corpus(corpus_name=f"{iso_code}_models_cltk")
        # get_file_with_progress_bar(model_url=model_url, file_path=self.fp_zip)
    else:
        print(message)
        dl_is_allowed = query_yes_no(
            f"Do you want to download '{model_url}' to '~/cltk_data/{iso_code}'?"
        )  # type: bool
        if dl_is_allowed:
            fetch_corpus.import_corpus(corpus_name=f"{iso_code}_models_cltk")
            # get_file_with_progress_bar(model_url=model_url, file_path=self.fp_zip)
        else:
            raise CLTKException(
                f"Download of necessary model declined for '{iso_code}'. Following functions will likely fail."
            )
def spacy_tag_ner(
    iso_code: str, text_tokens: List[str], model_path: str
) -> List[Union[str, bool]]:
    """Take a list of tokens and return a label or ``False`` for each.

    >>> text_tokens = ["Gallia", "est", "omnis", "divisa", "in", "partes", "tres", ",", "quarum", "unam", "incolunt", "Belgae", ",", "aliam", "Aquitani", ",", "tertiam", "qui", "ipsorum", "lingua", "Celtae", ",", "nostra", "Galli", "appellantur", "."]
    >>> from cltk.utils import CLTK_DATA_DIR
    >>> spacy_tag_ner('lat', text_tokens=text_tokens, model_path=os.path.join(CLTK_DATA_DIR, "lat/model/lat_models_cltk/ner/spacy_model/"))
    ['LOCATION', False, False, False, False, False, False, False, False, False, False, 'LOCATION', False, False, 'LOCATION', False, False, False, False, False, 'LOCATION', False, False, 'LOCATION', False, False]
    """
    # make sure that we have a List[str]
    if not isinstance(text_tokens[0], str):
        raise CLTKException("`spacy_tag_ner()` requires `List[str]`.")
    if not os.path.isdir(model_path):
        msg = f"spaCy model path '{model_path}' not found. Going to try to download it ..."
        logging.warning(msg)
        dl_msg = "This part of the CLTK depends upon models from the CLTK project."
        model_url = f"https://github.com/cltk/{iso_code}_models_cltk"
        download_prompt(iso_code=iso_code, message=dl_msg, model_url=model_url)
    spacy_nlp = spacy.load(model_path)
    # Create the tokenizer for the spacy model
    spacy_nlp.tokenizer = CustomTokenizer(vocab=spacy_nlp.vocab)
    # Create the spacy Doc object that contains the metadata for entities
    spacy_doc = spacy_nlp(text_tokens)  # type: Doc
    # generate the final output
    token_labels = list()  # type: List[Union[str, bool]]
    for word in spacy_doc:
        if word.ent_type_:
            # word.ent_type_  # type: str
            token_labels.append(word.ent_type_)
        else:
            token_labels.append(False)
    return token_labels
def _download_model(self) -> None:
    """Interface with the `stanza` model downloader."""
    if not self.interactive:
        if not self.silent:
            print(
                f"CLTK message: Going to download required Stanza models to ``{self.model_path}`` ..."
            )  # pragma: no cover
        stanza.download(lang=self.stanza_code, package=self.treebank)
    else:
        print(  # pragma: no cover
            "CLTK message: This part of the CLTK depends upon the Stanza NLP library."
        )  # pragma: no cover
        dl_is_allowed = query_yes_no(
            f"CLTK message: Allow download of Stanza models to ``{self.model_path}``?"
        )  # type: bool
        if dl_is_allowed:
            stanza.download(lang=self.stanza_code, package=self.treebank)
        else:
            raise CLTKException(
                f"Download of necessary Stanza model declined for '{self.language}'. Unable to continue with Stanza's processing."
            )
    # if file model still not available after attempted DL, then raise error
    if not file_exists(self.model_path):
        raise FileNotFoundError(
            "Missing required models for ``stanza`` at ``{0}``.".format(self.model_path)
        )
def algorithm(self):
    if self.language == "lat":
        lex_class = LatinLewisLexicon()
    else:
        raise CLTKException(f"No lookup algorithm for language '{self.language}'.")
    return lex_class
def from_ud(feature_name: str, feature_value: str) -> Optional[MorphosyntacticFeature]:
    """For a given Universal Dependencies feature name and value,
    return the appropriate feature class/value.

    >>> from_ud('Case', 'Abl')
    ablative
    >>> from_ud('Abbr', 'Yes')
    pos
    >>> from_ud('PronType', 'Ind')
    indefinite
    """
    if feature_name in from_ud_map:
        feature_map = from_ud_map[feature_name]
    else:
        msg = f"{feature_name}: Unrecognized UD feature name"
        print("From `from_ud():`", msg)
        # raise CLTKException(msg)
        return None
    values = feature_value.split(",")
    for value in values:
        if value in feature_map:
            return feature_map[value]
        else:
            raise CLTKException(
                f"{value}: Unrecognized value for UD feature {feature_name}"
            )
def __getitem__(
    self, feature_name: Union[str, Type[MorphosyntacticFeature]]
) -> List[MorphosyntacticFeature]:
    """
    Use dict-type syntax for accessing the values of features.

    >>> f1 = f(F.pos, N.pos)
    >>> f1[F]
    [pos]
    >>> f1[V]
    Traceback (most recent call last):
    cltk.core.exceptions.CLTKException: {F: [pos], N: [pos]} unspecified for V
    >>> f1['F']
    [pos]
    """
    if isinstance(feature_name, str):
        if feature_name not in globals():
            raise TypeError(feature_name + " is not a morphosyntactic feature")
        feature_name = globals()[feature_name]
    if not issubclass(feature_name, MorphosyntacticFeature):
        raise TypeError(str(feature_name) + " is not a morphosyntactic feature")
    if feature_name in self.features:
        return self.features[feature_name]
    else:
        raise CLTKException(f"{self} unspecified for {feature_name}")
def download_fasttext_models(self):
    """Perform complete download of fastText models and save
    them in appropriate ``cltk_data`` dir.

    TODO: Add tests
    TODO: Implement ``overwrite``
    TODO: error out better or continue to _load_model?
    """
    model_url = self._build_fasttext_url()
    if not self.interactive:
        if not self.silent:
            print(
                f"CLTK message: Going to download file '{model_url}' to '{self.model_fp}' ..."
            )  # pragma: no cover
        get_file_with_progress_bar(model_url=model_url, file_path=self.model_fp)
    else:
        print(  # pragma: no cover
            "CLTK message: This part of the CLTK depends upon word embedding models from the Fasttext project."
        )  # pragma: no cover
        dl_is_allowed = query_yes_no(
            f"Do you want to download file '{model_url}' to '{self.model_fp}'?"
        )  # type: bool
        if dl_is_allowed:
            get_file_with_progress_bar(model_url=model_url, file_path=self.model_fp)
        else:
            raise CLTKException(
                f"Download of necessary fastText model declined for '{self.iso_code}'. Unable to continue with fastText processing."
            )
def _download_nlpl_models(self) -> None:
    """Perform complete download of Word2Vec models and save
    them in appropriate ``cltk_data`` dir.
    """
    model_url = self.MAP_LANG_TO_URL[self.iso_code]
    if not self.interactive:
        if not self.silent:
            print(
                f"CLTK message: Going to download file '{model_url}' to '{self.fp_zip}' ..."
            )  # pragma: no cover
        get_file_with_progress_bar(model_url=model_url, file_path=self.fp_zip)
    else:
        print(  # pragma: no cover
            "CLTK message: This part of the CLTK depends upon word embedding models from the NLPL project."
        )  # pragma: no cover
        dl_is_allowed = query_yes_no(
            f"Do you want to download file '{model_url}' to '{self.fp_zip}'?"
        )  # type: bool
        if dl_is_allowed:
            get_file_with_progress_bar(model_url=model_url, file_path=self.fp_zip)
        else:
            raise CLTKException(
                f"Download of necessary Word2Vec model declined for '{self.language}'. Unable to continue with Word2Vec processing."
            )
def divide_works(self, corpus):
    """Use the work-breaking option.

    TODO: Maybe incorporate this into ``convert_corpus()``
    TODO: Write test for this
    """
    if corpus == "tlg":
        orig_dir = make_cltk_path("originals/tlg")
        works_dir = make_cltk_path("grc/text/tlg/individual_works")
        file_prefix = "TLG"
        lat = False
    elif corpus == "phi5":
        orig_dir = make_cltk_path("originals/phi5")
        works_dir = make_cltk_path("lat/text/phi5/individual_works")
        file_prefix = "LAT"
        lat = True  # this is for the optional TLGU argument to convert()
    elif corpus == "phi7":
        raise CLTKException("``phi7`` cannot be divided into individual works.")
    else:
        raise CLTKException(f"Invalid corpus '{corpus}'. This should never happen.")

    if not os.path.exists(works_dir):
        os.makedirs(works_dir)

    files = os.listdir(orig_dir)
    texts = [x for x in files if x.endswith(".TXT") and x.startswith(file_prefix)]

    for file in texts:
        orig_file_path = os.path.join(orig_dir, file)
        new_file_path = os.path.join(works_dir, file)

        try:
            self.convert(orig_file_path, new_file_path, divide_works=True, lat=lat)
            logger.info("Writing files at %s to %s.", orig_file_path, works_dir)
        except Exception as err:
            logger.error("Failed to convert files: %s.", err)
def algorithm(self):
    valid_variants = ["fasttext", "nlpl"]
    if self.variant == "fasttext":
        return FastTextEmbeddings(iso_code=self.language)
    elif self.variant == "nlpl":
        return Word2VecEmbeddings(iso_code=self.language)
    else:
        valid_variants_str = "', '".join(valid_variants)
        raise CLTKException(
            f"Invalid embeddings ``variant`` ``{self.variant}``. Available: '{valid_variants_str}'."
        )
def download_stanza_model(iso_code: str) -> None:
    """Download language models, from the ``stanza`` project,
    that are supported by the CLTK or in scope. More here:
    `<https://stanfordnlp.github.io/stanza/models.html>`_.

    TODO: Re-enable `treebank` parameter
    """
    print(f"Going to download Stanza model for '{iso_code}'.")
    if iso_code not in AVAIL_STANZA_LANGS:
        raise CLTKException(f"Language '{iso_code}' not available for Stanza.")
    StanzaWrapper(language=iso_code, interactive=False, silent=False)
    print(f"Finished downloading Stanza for '{iso_code}'.")
def run(self, input_doc: Doc) -> Doc:
    lookup_algo = self.algorithm
    output_doc = deepcopy(input_doc)
    for word in output_doc.words:
        if self.language == "lat":
            word.definition = lookup_algo.lookup(word.lemma)
        elif self.language == "non":
            word.definition = lookup_algo.lookup(word.string)
        else:
            raise CLTKException(
                f"``LexiconProcess()`` not available for language '{self.language}'. This should never happen."
            )
    return output_doc
def _build_fasttext_filepath(self):
    """Create filepath at which to save a downloaded
    fasttext model.

    .. todo:: Do better than test for just name. Try trimming up to user home dir.

    >>> from cltk.embeddings.embeddings import FastTextEmbeddings  # doctest: +SKIP
    >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", silent=True)  # doctest: +SKIP
    >>> vec_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
    >>> os.path.split(vec_fp)[1]  # doctest: +SKIP
    'wiki.la.vec'
    >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", model_type="bin", silent=True)  # doctest: +SKIP
    >>> bin_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
    >>> os.path.split(bin_fp)[1]  # doctest: +SKIP
    'wiki.la.bin'
    >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", training_set="common_crawl", model_type="vec", silent=True)  # doctest: +SKIP
    >>> vec_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
    >>> os.path.split(vec_fp)[1]  # doctest: +SKIP
    'cc.la.300.vec'
    >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", training_set="common_crawl", model_type="bin", silent=True)  # doctest: +SKIP
    >>> bin_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
    >>> os.path.split(bin_fp)[1]  # doctest: +SKIP
    'cc.la.300.bin'
    """
    fasttext_code = MAP_LANGS_CLTK_FASTTEXT[self.iso_code]
    fp_model = None
    if self.training_set == "wiki":
        fp_model = os.path.join(
            CLTK_DATA_DIR,
            self.iso_code,
            "embeddings",
            "fasttext",
            f"wiki.{fasttext_code}.{self.model_type}",
        )
    elif self.training_set == "common_crawl":
        fp_model = os.path.join(
            CLTK_DATA_DIR,
            self.iso_code,
            "embeddings",
            "fasttext",
            f"cc.{fasttext_code}.300.{self.model_type}",
        )
    else:
        raise CLTKException(f"Unexpected ``training_set`` ``{self.training_set}``.")
    return fp_model
def _build_fasttext_url(self):
    """Make the URL at which the requested model may be downloaded."""
    fasttext_code = self.MAP_LANGS_CLTK_FASTTEXT[self.iso_code]
    if self.training_set == "wiki":
        if self.model_type == "vec":
            ending = "vec"
        else:
            # for .bin
            ending = "zip"
        url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{fasttext_code}.{ending}"
    elif self.training_set == "common_crawl":
        url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{fasttext_code}.300.{self.model_type}.gz"
    else:
        raise CLTKException("Unexpected exception.")
    return url
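# For illustration, the URL shapes this method produces for Latin (fastText
# language code "la", per the filepath doctests above):
#
#     wiki + vec         -> https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.vec
#     wiki + bin         -> https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.zip
#     common_crawl + bin -> https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz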
def syllabify(self, word: str, mode="SSP") -> Union[List[str], str]:
    """
    :param word: word to syllabify
    :param mode: syllabification algorithm, SSP (Sonority Sequence Principle)
                 or MOP (Maximum Onset Principle)
    :return: syllabified word
    """
    if mode == "SSP":
        res = self.syllabify_ssp(word)
    elif mode == "MOP":
        res = self.syllabify_mop(word)
    else:
        raise CLTKException(f"Unknown syllabification mode '{mode}'. Choose 'SSP' or 'MOP'.")
    if self.sep:
        return self.sep.join(res)
    return res
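# Usage sketch (a comment-only illustration). ``Syllabifier`` and its ``sep``
# argument below are assumptions; substitute the concrete class in this module
# that defines ``syllabify_ssp()``/``syllabify_mop()``.
#
#     syllabifier = Syllabifier(sep=".")          # hypothetical construction
#     syllabifier.syllabify("ormr", mode="SSP")   # returns a "."-joined string when ``sep`` is set
#     syllabifier.syllabify("ormr", mode="MOP")   # same word, Maximum Onset Principle
#     Syllabifier().syllabify("ormr")             # with no ``sep``, returns a list of syllables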
def lookup(self, lemma: str) -> str:
    """Perform match of a lemma against headwords. If more than one match,
    then return the concatenated entries. For example:

    >>> from cltk.lexicon.lat import LatinLewisLexicon
    >>> lll = LatinLewisLexicon(interactive=False)
    >>> lll.lookup("clemens")[:50]
    'clēmēns entis (abl. -tī; rarely -te, L.), adj. wit'
    >>> all(word in lll.lookup("levis") for word in ["levis", "lēvis"])  # Test for concatenated entries
    True
    >>> lll.lookup("omnia")
    ''
    >>> lll.lookup(".")
    ''
    >>> lll.lookup("123")
    ''
    >>> lll.lookup("175.")
    ''
    >>> lll.lookup("(")  # Test for regex special character
    ''
    """
    if not self.entries:
        raise CLTKException(
            "No lexicon entries found in the .yaml file. This should never happen."
        )

    if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
        return ""

    lemma = regex.escape(lemma.lower())

    keys = self.entries.keys()
    matches = [key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)]
    n_matches = len(matches)
    if n_matches > 1:
        return "\n".join([self.entries[key] for key in matches])
    elif n_matches == 1:
        return self.entries[matches[0]]
    else:
        return ""
def _check_and_download_tlgu_source(self):
    """Check if tlgu downloaded, if not download it."""
    path = make_cltk_path("grc/software/grc_software_tlgu/tlgu.h")
    if not os.path.isfile(path):
        dl_msg = "This part of the CLTK depends upon TLGU, software written by Dimitri Marinakis `<http://tlgu.carmen.gr/>`_."
        print(dl_msg)
        repo_url = "https://github.com/cltk/grc_software_tlgu.git"
        dl_dir = os.path.split(path)[0]
        dl_question = f"Do you want to download TLGU from '{repo_url}' to '{dl_dir}'?"
        if self.interactive:
            do_download = query_yes_no(question=dl_question)
        else:
            do_download = True
        if do_download:
            fetch_corpus = FetchCorpus(language="grc")
            fetch_corpus.import_corpus(corpus_name="grc_software_tlgu")
        else:
            raise CLTKException("TLGU software required for this class to work.")
def from_ud(feature_name: str, feature_value: str) -> Optional[MorphosyntacticFeature]:
    """For a given Universal Dependencies feature name and value,
    return the appropriate feature class/value.

    >>> from_ud('Case', 'Abl')
    ablative
    >>> from_ud('Abbr', 'Yes')
    pos
    >>> from_ud('PronType', 'Ind')
    indefinite
    """
    # Do cleanup on certain inputs that look like ``Number[psor]``.
    # Thus this is rewritten to ``feature_name = Number``
    # and ``feature_value = psor``.
    if "[" in feature_name and "]" in feature_name:
        feature_name_split: List[str] = feature_name.split("[", maxsplit=1)
        feature_name = feature_name_split[0]
        feature_value = feature_name_split[1][:-1]
        feature_value = feature_value.title()
    if feature_name in from_ud_map:
        feature_map = from_ud_map[feature_name]
    else:
        msg1: str = f"Unrecognized UD `feature_name` ('{feature_name}') with `feature_value` ('{feature_value}')."
        msg2: str = "Please raise an issue at <https://github.com/cltk/cltk/issues> and include a small sample to reproduce the error."
        print(msg1)
        print(msg2)
        # raise CLTKException(msg)
        return None
    values = feature_value.split(",")
    for value in values:
        if value in feature_map:
            return feature_map[value]
        else:
            raise CLTKException(
                f"{value}: Unrecognized value for UD feature {feature_name}"
            )
def lookup(self, lemma: str) -> str:
    """Perform match of a lemma against headwords. If more than one match,
    then return the concatenated entries. For example:

    >>> lll = LatinLewisLexicon()
    >>> lll.lookup("clemens")[:50]
    'clēmēns entis (abl. -tī; rarely -te, L.), adj. wit'
    >>> lll.lookup("omnia")
    ''
    >>> lll.lookup(".")
    ''
    >>> lll.lookup("123")
    ''
    >>> lll.lookup("175.")
    ''
    """
    if not self.entries:
        raise CLTKException(
            "No lexicon entries found in the .yaml file. This should never happen."
        )

    if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
        return ""

    lemma = lemma.lower()

    keys = self.entries.keys()
    matches = [key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)]
    n_matches = len(matches)
    if n_matches > 1:
        return "\n".join([self.entries[key] for key in matches])
    elif n_matches == 1:
        return self.entries[matches[0]]
    else:
        return ""
def __init__(self, interactive: bool = True):
    self.interactive = interactive
    self.lewis_yaml_fp = make_cltk_path(
        "lat", "lexicon", "cltk_lat_lewis_elementary_lexicon", "lewis.yaml"
    )
    try:
        self.entries = self._load_entries()
    except FileNotFoundError:
        if self.interactive:
            dl_msg = "This part of the CLTK depends upon Lewis's *An Elementary Latin Dictionary* (1890)."
            print(dl_msg)
            dl_question = "Do you want to download this?"
            do_download = query_yes_no(question=dl_question)
        else:
            do_download = True
        if do_download:
            fetch_corpus = FetchCorpus(language="lat")
            fetch_corpus.import_corpus(
                corpus_name="cltk_lat_lewis_elementary_lexicon"
            )
        else:
            raise CLTKException(
                f"File '{self.lewis_yaml_fp}' is not found. It is required for this class."
            )
        self.entries = self._load_entries()
def __init__(self, interactive: bool = True):
    self.interactive = interactive
    self.zoega_yaml_fp = make_cltk_path(
        "non", "dictionary", "cltk_non_zoega_dictionary", "dictionary.yaml"
    )
    try:
        self.entries = self._load_entries()
    except FileNotFoundError:
        if self.interactive:
            dl_msg = "This part of the CLTK depends upon Zoëga's *A Concise Old Norse Dictionary* (1890)."
            print(dl_msg)
            dl_question = "Do you want to download this?"
            do_download = query_yes_no(question=dl_question)
        else:
            do_download = True
        if do_download:
            fetch_corpus = FetchCorpus(language="non")
            fetch_corpus.import_corpus(corpus_name="cltk_non_zoega_dictionary")
        else:
            raise CLTKException(
                f"File '{self.zoega_yaml_fp}' is not found. It is required for this class."
            )
        self.entries = self._load_entries()
def cltk_doc_to_features_table(
    cltk_doc: Doc,
) -> Tuple[List[str], List[List[Union[str, int, float, None]]]]:
    """Take a CLTK ``Doc`` and return a list of lists ready for
    machine learning.

    This expects the default features available for Greek and Latin
    (word embeddings, morphology, syntax, lemmata). This should be
    improved to fail gracefully when fewer features are available
    in the input ``Doc``.

    TODO: Fail gracefully when missing info in ``Doc``.
    """
    if len(cltk_doc.sentences) < 1:
        raise CLTKException("Must contain at least one ``Doc.sentence``.")

    list_of_list_features = (
        list()
    )  # type: List[List[Union[str, int, float, None, np.ndarray]]]

    for sentence in cltk_doc.sentences:
        for word in sentence:
            word_features_list = (
                list()
            )  # type: List[Union[str, int, float, None, np.ndarray]]
            # note: this gets made and remade; only needs to be done once, at beginning or at end;
            # need to add check that len == the actual instance row
            variable_names = list()  # type: List[str]

            # Get word token chars
            word_features_list.append(word.string)
            variable_names.append("string")

            # Get lemma
            word_features_list.append(word.lemma)
            variable_names.append("lemma")

            # Get embedding
            word_features_list.append(word.embedding)
            variable_names.append("embedding")

            # Get stopword binary
            word_features_list.append(word.stop)
            variable_names.append("is_stop")

            # Get NER binary
            word_features_list.append(word.named_entity)
            variable_names.append("named_entity")

            # Get morphological info
            pos_label = get_pos(word=word)
            word_features_list.append(
                pos_label
            )  # note: incorrectly labels upper-cased words as proper_noun, eg 'Βίβλος'
            variable_names.append("pos")
            feature_names, features_present = get_features(word=word)
            word_features_list += features_present  # add the features list to the big list
            variable_names += feature_names

            # Get dependency info
            governing_word = get_governor_word(word=word, sentence=sentence)
            pos_label_governor = get_pos(word=governing_word)
            word_features_list.append(pos_label_governor)
            variable_names.append("governing_word")
            feature_names_governor, features_present_governor = get_features(
                word=governing_word, prepend_to_label="governor_"
            )
            word_features_list += features_present_governor  # add the features list to the big list
            variable_names += feature_names_governor
            # governor_edge = get_governor_relationship(word=word, sentence=sentence)
            # word_features_list.append(governor_edge)
            relation_type = word.dependency_relation
            word_features_list.append(relation_type)
            variable_names.append("governing_relationship")

            list_of_list_features.append(word_features_list)

    assert len(variable_names) == len(
        list_of_list_features[0]
    ), f"The number of variable names ({len(variable_names)}) does not match the actual number of variables ({len(list_of_list_features[0])}). These must be equal."
    return variable_names, list_of_list_features
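# Usage sketch (kept as a comment so nothing runs at import time).
# ``from cltk import NLP`` and ``NLP.analyze()`` are the standard CLTK 1.x
# entry points; adjust the import of ``cltk_doc_to_features_table`` to
# whichever module this function lives in.
#
#     from cltk import NLP
#
#     nlp = NLP(language="lat")
#     doc = nlp.analyze(text="Gallia est omnis divisa in partes tres")
#     names, rows = cltk_doc_to_features_table(doc)
#     assert len(names) == len(rows[0])  # one column name per feature value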
def decline(
    self, lemma: str, flatten: bool = False, collatinus_dict: bool = False
) -> List[Tuple[str, str]]:
    """Decline a lemma

    .. warning:: POS are incomplete, as we do not detect the type outside of verbs, participles and adjectives.

    :raise CLTKException: When the lemma is unknown to our data
    :param lemma: Lemma (canonical form) to decline
    :type lemma: str
    :param flatten: If set to True, returns a list of forms without natural language information about them
    :type flatten: bool
    :param collatinus_dict: If set to True, returns a dictionary of grammatically valid forms, including variants,
        with keys corresponding to morphological information
    :type collatinus_dict: bool
    :return: List of tuples where the first value is the form and the second the POS, e.g. [("sum", "v1ppip---")]
    :rtype: list or dict
    """
    if lemma in self._lemmas:
        # Get data information
        lemma_entry = self._lemmas[lemma]
    elif lemma in self._mapped and self._mapped[lemma] in self._lemmas:
        # Get data information
        lemma = self._mapped[lemma]
        lemma_entry = self._lemmas[lemma]
    else:
        raise CLTKException("%s is unknown" % lemma)

    model = self._models[lemma_entry["model"]]

    # Get the roots
    roots = self._getRoots(lemma, model=model)

    # Get the known forms in order
    keys = sorted([int(key) for key in model["des"].keys()])
    forms_data = [(key, model["des"][str(key)]) for key in keys]

    # Generate the return dict
    forms = {key: [] for key in keys}
    for key, form_list in forms_data:
        for form in form_list:
            root_id, endings = tuple(form)
            for root in roots[root_id]:
                for ending in endings:
                    forms[key].append(root + ending)

    # sufd means we have the original forms of the parent but we add a suffix
    if len(model["sufd"]):
        # For each constant form
        for key, iter_forms in forms.items():
            new_forms = []
            # We add the constant suffix
            for sufd in model["sufd"]:
                new_forms += [form + sufd for form in iter_forms]
            forms[key] = new_forms

    # If we need a secure version of the forms. For example, if we have variants
    if len(model["suf"]):
        cached_forms = {
            k: v + [] for k, v in forms.items()
        }  # Making cache without using copy
        # For each suffix
        # The format is [suffix characters, [modified forms]]
        for suffixes in model["suf"]:
            suffix, modified_forms = suffixes[0], suffixes[1]
            for modified_form in modified_forms:
                forms[modified_form] += [
                    f + suffix for f in cached_forms[modified_form]
                ]
        # We update with the new roots

    # If some forms do not exist, we delete them preemptively
    if len(model["abs"]):
        for abs_form in model["abs"]:
            if abs_form in forms:
                del forms[abs_form]

    if flatten:
        return list(
            [form for case_forms in forms.values() for form in case_forms]
        )
    elif collatinus_dict:
        return forms
    else:
        return list(
            [
                (form, self.__getPOS(key))
                for key, case_forms in forms.items()
                for form in case_forms
            ]
        )
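# Usage sketch. The import path below is an assumption (the Collatinus-based
# decliner that owns ``decline()``); adjust it to wherever the class lives in
# your installation.
#
#     from cltk.morphology.lat import CollatinusDecliner  # assumed location
#
#     decliner = CollatinusDecliner()
#     decliner.decline("via")                 # list of (form, POS-code) tuples
#     decliner.decline("via", flatten=True)   # bare list of inflected forms
#     # An unknown lemma raises CLTKException, per the docstring above.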
def _getRoots(self, lemma, model):
    """Retrieve the known roots of a lemma

    :param lemma: Canonical form of the word (lemma)
    :type lemma: str
    :param model: Model data from the loaded self.__data__. Can be passed by decline()
    :type model: dict
    :return: Dictionary of roots with their root identifier as key
    :rtype: dict
    """
    if lemma not in self._lemmas:
        raise CLTKException("%s is unknown" % lemma)

    ROOT_IDS = {"K": "lemma", "1": "geninf", "2": "perf"}

    lemma_entry = self._lemmas[lemma]
    if "quantity" in lemma_entry and lemma_entry["quantity"]:
        lemma_in_lemma_entry = lemma_entry["quantity"]
    else:
        lemma_in_lemma_entry = self._remove_disambiguation(lemma_entry["lemma"])

    original_roots = {
        root_id: lemma_entry[root_name].split(",")
        for root_id, root_name in ROOT_IDS.items()
        if root_id != "K" and lemma_entry[root_name]
    }
    returned_roots = {}

    if not model:
        model = self._models[lemma_entry["model"]]

    # For each registered root in the model,
    for model_root_id, model_root_data in model["R"].items():

        # If we have K, it's equivalent to canonical form
        if model_root_data[0] == "K":
            returned_roots[model_root_id] = lemma_in_lemma_entry.split(",")
        # Otherwise we have deletion number and addition char
        else:
            deletion, addition = int(model_root_data[0]), model_root_data[1] or ""

            # If the root is declared already,
            # we retrieve the information
            if model_root_id != "1" and model_root_id in returned_roots:
                lemma_roots = returned_roots[model_root_id]
            else:
                lemma_roots = lemma_in_lemma_entry.split(",")
            # We construct the roots
            returned_roots[model_root_id] = [
                lemma_root[:-deletion] + addition for lemma_root in lemma_roots
            ]

        if model_root_id in original_roots:
            returned_roots[model_root_id].extend(original_roots[model_root_id])
            returned_roots[model_root_id] = list(set(returned_roots[model_root_id]))

    original_roots.update(returned_roots)
    return original_roots
def _check_install(self):
    """Check if tlgu installed, if not install it."""
    try:
        subprocess.check_output(["which", "tlgu"])
    except subprocess.SubprocessError as sub_err:
        print("TLGU not installed.")
        logger.info("TLGU not installed: %s", sub_err)
        logger.info("Installing TLGU.")
        if not subprocess.check_output(["which", "gcc"]):
            logger.error("GCC seems not to be installed.")
        else:
            tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
            if self.interactive:
                install_question = "Do you want to install TLGU?"
                do_install = query_yes_no(question=install_question)
                if not do_install:
                    raise CLTKException(
                        "TLGU installation required for this class to work."
                    )
            else:
                print("Non-interactive installation. Continuing ...")
            command = "cd {0} && make install".format(tlgu_path)
            print(f"Going to run command: ``{command}``")
            try:
                p_out = subprocess.call(command, shell=True)
            except subprocess.SubprocessError as sub_err:
                print(
                    "Error executing installation. Going to check output of ``subprocess.call()`` ..."
                )
                raise CLTKException(sub_err)
            if p_out == 0:
                msg = "TLGU installed."
                print(msg)
                logger.info(msg)
                return True
            else:
                msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
                print(msg)
                logger.error(msg)
                command = "cd {0} && sudo make install".format(tlgu_path)
                if self.interactive:
                    install_question = "Do you want to install TLGU with sudo?"
                    do_install = query_yes_no(question=install_question)
                    if not do_install:
                        raise CLTKException(
                            "TLGU installation required for this class to work."
                        )
                    p_out = subprocess.call(command, shell=True)
                else:
                    print("Going to run command:", command)
                    p_out = subprocess.call(command, shell=True)
                if p_out == 0:
                    msg = "TLGU installed."
                    print(msg)
                    logger.info(msg)
                else:
                    msg = "TLGU install with sudo failed."
                    print(msg)
                    logger.error(msg)
                    raise CLTKException(
                        "TLGU installation required for this class to work."
                    )
PARSER = argparse.ArgumentParser()
PARSER.add_argument(
    "--languages", help="What languages to download. Comma separated, no spaces."
)
ARGS = PARSER.parse_args()
SELECTED_LANGS = list()  # type: List[str]
ALL_AVAILABLE_LANGS = list(iso_to_pipeline.keys())  # type: List[str]
if not ARGS.languages:
    SELECTED_LANGS = ALL_AVAILABLE_LANGS
else:
    SELECTED_LANGS_SPLIT = ARGS.languages.split(",")
    for LANG in SELECTED_LANGS_SPLIT:
        if LANG not in ALL_AVAILABLE_LANGS:
            raise CLTKException(
                f"Unavailable language '{LANG}' chosen. Choose from: {', '.join(ALL_AVAILABLE_LANGS)}"
            )
    SELECTED_LANGS = SELECTED_LANGS_SPLIT


def download_stanza_model(iso_code: str) -> None:
    """Download language models, from the ``stanza`` project,
    that are supported by the CLTK or in scope. More here:
    `<https://stanfordnlp.github.io/stanza/models.html>`_.

    TODO: Re-enable `treebank` parameter
    """
    print(f"Going to download Stanza model for '{iso_code}'.")
    if iso_code not in AVAIL_STANZA_LANGS:
        raise CLTKException(f"Language '{iso_code}' not available for Stanza.")
    StanzaWrapper(language=iso_code, interactive=False, silent=False)
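# Example invocation of this download script (the filename is assumed here;
# use whatever the module is actually named in the repository):
#
#     python download_models.py --languages lat,grc
#
# Omitting ``--languages`` selects every language in ``iso_to_pipeline``.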