Example #1
0
    def __init__(self, snlp, meta=None, **kwargs):
        """Wrap a loaded Stanza pipeline in a spaCy-style language object.

        The language code is prefixed with "stanza_" (e.g. "stanza_en"
        rather than "en") so that it does not conflict with spaCy's
        built-in languages. Via entry points this also lets the class be
        serialized and deserialized: a "lang": "stanza_en" entry in
        meta.json instantiates this class when the package is available.

        snlp (stanza.Pipeline): The loaded Stanza pipeline.
        meta (dict): Optional meta data. When None, a meta dict holding the
            prefixed language code and the Stanza config is built instead.
        kwargs: Optional config parameters ("max_length", "batch_size").
        RETURNS (spacy.language.Language): The nlp object.
        """
        # The processor-config lookup is the fallback kept for backward
        # compatibility with stanza v1.0.0.
        lang_code = (
            snlp.lang
            if hasattr(snlp, "lang")
            else snlp.processors["tokenize"].config["lang"]
        )

        self.snlp = snlp
        self.svecs = StanzaLanguage._find_embeddings(snlp)
        self.lang = "stanza_" + lang_code

        # Defaults -> vocab -> tokenizer: each step needs the previous one.
        self.Defaults = get_defaults(lang_code)
        self.vocab = create_vocab(lang_code, self.Defaults)
        self.tokenizer = Tokenizer(snlp, self.vocab)

        self._components, self._disabled = [], set()
        self.max_length = kwargs.get("max_length", 10 ** 6)
        self.batch_size = kwargs.get("batch_size", 256)

        if meta is None:
            self._meta = {"lang": self.lang, "stanza": snlp.config}
        else:
            # Copy so the caller's mapping is not shared with this object.
            self._meta = dict(meta)

        self._path = self._optimizer = None
Example #2
0
 def __init__(self,convUD):
   """Set up the language object for language code "eu" using ixaKat.

   Registers a LANG getter on self.Defaults that always returns "eu",
   builds the vocab, attaches an ixaKatTokenizer and fills in the
   package meta data.

   convUD: forwarded unchanged to ixaKatTokenizer.
   """
   self.Defaults.lex_attr_getters[LANG]=lambda _text:"eu"
   try:
     # First try the API where the Defaults class builds the vocab itself.
     self.vocab=self.Defaults.create_vocab()
     self.pipeline=[]
   except Exception:
     # Fall back to spacy.vocab.create_vocab when the API above fails
     # (presumably on spaCy v3 -- confirm). Catching Exception instead of
     # using a bare except keeps KeyboardInterrupt/SystemExit propagating.
     from spacy.vocab import create_vocab
     self.vocab=create_vocab("eu",self.Defaults)
     self._components=[]
     self._disabled=set()
   self.tokenizer=ixaKatTokenizer(self.vocab,convUD)
   self._meta={
     "author":"Koichi Yasuoka",
     "description":"derived from ixaKat",
     "lang":"eu_ixaKat",
     "license":"MIT",
     "name":"eu_ixaKat",
     "pipeline":"Tokenizer, POS-Tagger, Parser",
     "spacy_version":">=2.2.2"
   }
   self._path=None
Example #3
0
 def __init__(self,UniDic):
   """Set up the language object for language code "ja" using SynCha.

   Registers a LANG getter on self.Defaults that always returns "ja",
   builds the vocab, attaches a SynChaTokenizer and fills in the
   package meta data.

   UniDic: forwarded unchanged to SynChaTokenizer.
   """
   self.Defaults.lex_attr_getters[LANG]=lambda _text:"ja"
   try:
     # First try the API where the Defaults class builds the vocab itself.
     self.vocab=self.Defaults.create_vocab()
     self.pipeline=[]
   except Exception:
     # Fall back to spacy.vocab.create_vocab when the API above fails
     # (presumably on spaCy v3 -- confirm). Catching Exception instead of
     # using a bare except keeps KeyboardInterrupt/SystemExit propagating.
     from spacy.vocab import create_vocab
     self.vocab=create_vocab("ja",self.Defaults)
     self._components=[]
     self._disabled=set()
   self.tokenizer=SynChaTokenizer(self.vocab,UniDic)
   self._meta={
     "author":"Koichi Yasuoka",
     "description":"derived from SynCha-CaboCha-MeCab",
     "lang":"ja_SynCha_CaboCha_MeCab",
     "license":"MIT",
     "name":"SynCha_CaboCha_MeCab",
     "pipeline":"Tokenizer, POS-Tagger, Parser",
     "spacy_version":">=2.2.2"
   }
   self._path=None
Example #4
0
 def __init__(self,BERT,Danku):
   """Set up the language object for language code "lzh" using SuParKanbun.

   Registers a LANG getter on self.Defaults that always returns "lzh",
   builds the vocab, attaches a SuParKanbunTokenizer and fills in the
   package meta data.

   BERT: forwarded unchanged to SuParKanbunTokenizer.
   Danku: forwarded unchanged to SuParKanbunTokenizer.
   """
   self.Defaults.lex_attr_getters[LANG]=lambda _text:"lzh"
   try:
     # First try the API where the Defaults class builds the vocab itself.
     self.vocab=self.Defaults.create_vocab()
     self.pipeline=[]
   except Exception:
     # Fall back to spacy.vocab.create_vocab when the API above fails
     # (presumably on spaCy v3 -- confirm). Catching Exception instead of
     # using a bare except keeps KeyboardInterrupt/SystemExit propagating.
     from spacy.vocab import create_vocab
     self.vocab=create_vocab("lzh",self.Defaults)
     self._components=[]
     self._disabled=set()
   self.tokenizer=SuParKanbunTokenizer(BERT,Danku,self.vocab)
   self._meta={
     "author":"Koichi Yasuoka",
     "description":"derived from SuParKanbun",
     "lang":"SuParKanbun_lzh",
     "license":"MIT",
     "name":"SuParKanbun_lzh",
     "parent_package":"suparkanbun",
     "pipeline":"Tokenizer, POS-Tagger, Parser",
     "spacy_version":">=2.1.0"
   }
   self._path=None
Example #5
0
 def __init__(self,UniDic,UDPipe):
   """Set up the language object for language code "ja" using UniDic2UD.

   Registers a LANG getter on self.Defaults that always returns "ja",
   builds the vocab, attaches a UniDicTokenizer and fills in the
   package meta data.

   UniDic: forwarded to UniDicTokenizer. When it is not None it is also
     appended to "UniDic_" for the meta "lang" and used as the meta
     "name" (so presumably a string -- confirm); when None the meta
     falls back to "udpipe_ja-modern" / "ja-modern".
   UDPipe: forwarded unchanged to UniDicTokenizer.
   """
   self.Defaults.lex_attr_getters[LANG]=lambda _text:"ja"
   try:
     # First try the API where the Defaults class builds the vocab itself.
     self.vocab=self.Defaults.create_vocab()
     self.pipeline=[]
   except Exception:
     # Fall back to spacy.vocab.create_vocab when the API above fails
     # (presumably on spaCy v3 -- confirm). Catching Exception instead of
     # using a bare except keeps KeyboardInterrupt/SystemExit propagating.
     from spacy.vocab import create_vocab
     self.vocab=create_vocab("ja",self.Defaults)
     self._components=[]
     self._disabled=set()
   self.tokenizer=UniDicTokenizer(UniDic,UDPipe,self.vocab)
   self._meta={
     "author":"Koichi Yasuoka",
     "description":"derived from UniDic2UD",
     # Identity test against None instead of the original `!=None`.
     "lang":"UniDic_"+UniDic if UniDic is not None else "udpipe_ja-modern",
     "license":"MIT",
     "name":UniDic if UniDic is not None else "ja-modern",
     "parent_package":"spacy_unidic",
     "pipeline":"Tokenizer, POS-Tagger, Parser",
     "spacy_version":">=2.1.0"
   }
   self._path=None
Example #6
0
 def __init__(self, api):
     """Set up the language object for language code "cop" using Coptic-NLP.

     Registers a LANG getter on self.Defaults that always returns "cop",
     builds the vocab, attaches a CopticTokenizer and fills in the
     package meta data.

     api: forwarded unchanged to CopticTokenizer.
     """
     self.Defaults.lex_attr_getters[LANG] = lambda _text: "cop"
     try:
         # First try the API where the Defaults class builds the vocab.
         self.vocab = self.Defaults.create_vocab()
         self.pipeline = []
     except Exception:
         # Fall back to spacy.vocab.create_vocab when the API above fails
         # (presumably on spaCy v3 -- confirm). Catching Exception instead
         # of using a bare except keeps KeyboardInterrupt/SystemExit
         # propagating.
         from spacy.vocab import create_vocab
         self.vocab = create_vocab("cop", self.Defaults)
         self._components = []
         self._disabled = set()
     self.tokenizer = CopticTokenizer(api, self.vocab)
     self._meta = {
         "author": "Koichi Yasuoka",
         "description": "derived from Coptic-NLP",
         "lang": "Coptic_NLP_cop",
         "license": "MIT",
         "name": "Coptic_NLP_cop",
         "parent_package": "Coptic-NLP",
         "pipeline": "Tokenizer, POS-Tagger, Parser",
         "spacy_version": ">=2.1.0"
     }
     self._path = None
Example #7
0
    def __init__(self,
                 udpipe_model: UDPipeModel,
                 meta: Optional[Dict] = None,
                 **kwargs):
        """Wrap a loaded UDPipe model in a spaCy-style language object.

        The language code is prefixed with "udpipe_" (e.g. "udpipe_en"
        rather than "en") to avoid any conflict with spaCy's built-in
        languages. Via entry points this also enables serializing and
        deserializing the class: a "lang": "udpipe_en" entry in meta.json
        instantiates this class when the package is available.

        udpipe_model: The loaded UDPipe model.
        meta: spaCy model metadata; when None the model's own `_meta` is
            used, otherwise a copy of the given mapping.
        kwargs: Optional config parameters. "ignore_tag_map" (default
            False) empties the defaults' tag map; "max_length" (default
            10**6) sets `max_length`.
        """
        self.udpipe = udpipe_model
        lang_code = udpipe_model._lang
        self.Defaults = get_defaults(lang=lang_code)
        self.lang = f"udpipe_{lang_code}"

        if kwargs.get("ignore_tag_map", False):
            # workaround for ValueError: [E167]
            self.Defaults.tag_map = {}

        if not SPACY_V3:
            self.vocab = self.Defaults.create_vocab()
            self.pipeline = []
        else:
            from spacy.vocab import create_vocab
            from spacy.language import DEFAULT_CONFIG
            self.vocab = create_vocab(lang_code, self.Defaults)
            self.batch_size = 1000
            self._components, self._disabled = [], set()
            self._config = DEFAULT_CONFIG.merge(self.default_config)

        self.tokenizer = UDPipeTokenizer(model=self.udpipe, vocab=self.vocab)
        self.max_length = kwargs.get("max_length", 10 ** 6)

        if meta is None:
            self._meta = self.udpipe._meta
        else:
            self._meta = dict(meta)

        self._path = self._optimizer = None