Ejemplo n.º 1
0
def download_fasttext_model(iso_code: str,
                            model_source: str = "wiki",
                            interactive: bool = False) -> None:
    """Download the fasttext model for one language and training source.

    The work is delegated to the ``FastTextEmbeddings`` constructor, which
    is called with ``overwrite=False`` and ``silent=False``.

    Args:
        iso_code: Language code of the model; it must be listed for the
            chosen ``model_source``.
        model_source: Training corpus of the model, ``"wiki"`` (default)
            or ``"common_crawl"``. Forwarded to ``FastTextEmbeddings`` as
            ``training_set``.
        interactive: Forwarded unchanged to ``FastTextEmbeddings``.

    Raises:
        AssertionError: If ``model_source`` is not a known source.
        CLTKException: If ``iso_code`` has no model for ``model_source``.
    """
    print(f"Going to download fasttext model for '{iso_code}'.")
    # Insertion order is significant: it determines the order in which the
    # names appear in the "Choose from" error messages below.
    models_by_source = {
        "wiki": ["ang", "arb", "arc", "got", "lat", "pli", "san"],
        "common_crawl": ["arb", "lat", "san"],
    }
    if model_source not in models_by_source:
        # Raised explicitly instead of via ``assert`` so the validation is
        # not stripped when Python runs with ``-O``; the exception type is
        # kept so existing callers see no difference.
        raise AssertionError(
            f"Invalid `model_source`. Choose from: {', '.join(models_by_source)}."
        )
    available_languages = models_by_source[model_source]
    if iso_code not in available_languages:
        raise CLTKException(
            f"Language '{iso_code}' not available for `model_source` '{model_source}'. Choose from: {', '.join(available_languages)}."
        )
    # Pass the validated source along; without it a "common_crawl" request
    # would fall back to the constructor's default training set.
    FastTextEmbeddings(iso_code=iso_code,
                       training_set=model_source,
                       interactive=interactive,
                       overwrite=False,
                       silent=False)
    print(f"Finished downloading fasttext for '{iso_code}'.")
def get_all_fasttext_models(interactive=False) -> None:
    """Instantiate ``FastTextEmbeddings`` once per wiki-trained language.

    Each instance is created with ``overwrite=False`` and ``silent=False``;
    ``interactive`` is forwarded unchanged. Only the wiki language list is
    processed here; the Common Crawl languages ("arb", "lat", "san") are
    not covered by this function.
    """
    wiki_iso_codes = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    # Options shared by every constructor call in the loop.
    shared_options = {
        "interactive": interactive,
        "overwrite": False,
        "silent": False,
    }
    for iso_code in wiki_iso_codes:
        FastTextEmbeddings(iso_code=iso_code, **shared_options)
Ejemplo n.º 3
0
 def algorithm(self):
     """Build the embeddings object selected by ``self.variant``.

     Returns a ``FastTextEmbeddings`` when the variant is ``"fasttext"``
     and a ``Word2VecEmbeddings`` when it is ``"nlpl"``; both are created
     with ``iso_code=self.language``. Any other variant raises
     ``CLTKException`` naming the supported variants.
     """
     chosen = self.variant
     if chosen == "fasttext":
         return FastTextEmbeddings(iso_code=self.language)
     if chosen == "nlpl":
         return Word2VecEmbeddings(iso_code=self.language)
     # No supported variant matched: report the allowed names.
     supported_listing = "', '".join(("fasttext", "nlpl"))
     raise CLTKException(
         f"Invalid embeddings ``variant`` ``{chosen}``. Available: '{supported_listing}'."
     )
Ejemplo n.º 4
0
    def test_embeddings_fasttext(self):
        """Check ``FastTextEmbeddings`` nearest neighbours and error cases.

        For every supported language the most similar word returned by
        ``get_sims`` for a query word is compared with a fixed expected
        word. The constructor is then checked to raise the expected
        exception, with the expected message, for a language without
        embeddings, an unknown ISO code, and a training set that is not
        available for the requested language.
        """
        # (iso_code, query word, expected most-similar word). The non-ASCII
        # words are stored as real Unicode text; they had previously been
        # corrupted by a wrong-codec round trip.
        expected_neighbours = [
            ("ang", "mōnaþ", "hāliȝmōnaþ"),
            ("arb", "بعدها", "وبعدها"),
            ("arc", "ܒܠܚܘܕ", "ܠܒܪ"),
            ("got", "𐍅𐌰𐌹𐌷𐍄𐌹𐌽𐍃", "𐍅𐌰𐌹𐌷𐍄𐍃"),
            ("lat", "amicitia", "amicitiam"),
            ("pli", "anattamanā", "kupitā"),
            ("san", "निर्माणम्", "निर्माणमपि"),
        ]
        for iso_code, query_word, expected_word in expected_neighbours:
            # subTest reports each language separately instead of stopping
            # at the first failing one.
            with self.subTest(iso_code=iso_code):
                embeddings_obj = FastTextEmbeddings(
                    iso_code=iso_code,
                    interactive=False,
                    silent=True,
                    overwrite=False,
                )
                self.assertIsInstance(embeddings_obj, FastTextEmbeddings)
                most_similar_word = embeddings_obj.get_sims(word=query_word)[0][0]
                self.assertEqual(most_similar_word, expected_word)

        # A known language for which no fasttext embeddings are listed.
        with self.assertRaises(
            UnimplementedAlgorithmError
        ) as exception_context_manager:
            FastTextEmbeddings(
                iso_code="ave", interactive=False, silent=True, overwrite=False
            )
        exception = exception_context_manager.exception
        self.assertEqual(
            exception.args,
            (
                "No embedding available for language 'ave'. FastTextEmbeddings available for: 'ang', 'arb', 'arc', 'got', 'lat', 'pli', 'san'.",
            ),
        )

        # A code that is not a recognised ISO language code.
        with self.assertRaises(UnknownLanguageError) as exception_context_manager:
            FastTextEmbeddings(
                iso_code="xxx", interactive=False, silent=True, overwrite=False
            )
        exception = exception_context_manager.exception
        self.assertEqual(exception.args, ("Unknown ISO language code 'xxx'.",))

        # A valid language requested with a training set it does not have.
        with self.assertRaises(CLTKException) as exception_context_manager:
            FastTextEmbeddings(
                iso_code="got",
                training_set="common_crawl",
                interactive=False,
                silent=True,
                overwrite=False,
            )
        exception = exception_context_manager.exception
        self.assertEqual(
            exception.args,
            (
                "Training set 'common_crawl' not available for language 'got'. Languages available for this training set: 'arb', 'lat', 'san'.",
            ),
        )