from spacy.tokens import Doc, Token


def process_tokenlist(nlp, tokenlist, enriched=False):
    """process_tokenlist: creates a spaCy Doc from a token list

    :param nlp: spaCy NLP object
    :param tokenlist: list of dicts containing tokens and parameters
    :param enriched: if set to True the spaCy pipeline is run
    """
    json = {}
    json['tokenArray'] = tokenlist
    ar_tok = [x['value'] for x in json['tokenArray']]
    ar_wsp = [x.get('whitespace', True) for x in json['tokenArray']]
    if Token.get_extension('tokenId') is None:
        Token.set_extension('tokenId', default=False)
    doc = Doc(nlp.vocab, words=ar_tok, spaces=ar_wsp)
    for id, t in enumerate(doc):
        t._.set('tokenId', json['tokenArray'][id].get('tokenId', False))
        t_type = json['tokenArray'][id].get('type', False)
        if not t.tag_ and t_type:
            t.tag_ = t_type
        for k in json['tokenArray'][id].keys():
            # copy over any further parameters that map onto native token
            # attributes (SPACY_ACCEPTED_DATA is a module-level constant
            # defined elsewhere in the original source)
            if k.upper() in SPACY_ACCEPTED_DATA:
                setattr(t, k.lower(), json['tokenArray'][id][k])
        # TODO: need to set ent_iob
    if enriched:
        for name, proc in nlp.pipeline:
            doc = proc(doc)
    return doc
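# A minimal usage sketch for process_tokenlist above. The token list and the
# model name are illustrative assumptions, not part of the original snippet;
# it requires an installed spaCy model such as en_core_web_sm and the
# module-level SPACY_ACCEPTED_DATA constant from the original source.
import spacy

nlp = spacy.load('en_core_web_sm')
tokens = [
    {'value': 'Hello', 'whitespace': True, 'tokenId': 't1'},
    {'value': 'world', 'whitespace': False, 'tokenId': 't2'},
]
doc = process_tokenlist(nlp, tokens, enriched=True)
print([(t.text, t._.tokenId) for t in doc])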
def __init__(self, data_dir=DATA_DIR, model_dir_path=None, lexicon_file_path=None,
             tag_file_path=None, package=PACKAGE, url=URL_MODEL, print_probas=False):
    super(POSTagger, self).__init__(package, url=url, download_dir=data_dir)
    if not tk.get_extension(self.name):
        tk.set_extension(self.name, default=None)
    else:
        LOGGER.info('Token {} already registered'.format(self.name))
    model_dir_path = model_dir_path if model_dir_path else os.path.join(
        data_dir, package, 'models/fr')
    lexicon_file_path = lexicon_file_path if lexicon_file_path else os.path.join(
        model_dir_path, 'lexicon.json')
    tag_file_path = tag_file_path if tag_file_path else os.path.join(
        model_dir_path, 'tag_dict.json')
    LOGGER.info(" TAGGER: Loading lexicon...")
    self.lex_dict = unserialize(lexicon_file_path)
    LOGGER.info(" TAGGER: Loading tags...")
    self.tag_dict = unserialize(tag_file_path)
    self.classifier = MaxEntClassifier()
    self.cache = {}
    self._load_model(model_dir_path)
    # print the probability of each tag along with the tag itself
    self.print_probas = print_probas
    return
class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    if not Token.get_extension('inf'):
        Token.set_extension('inf', default='')
    if not Token.get_extension('reading'):
        Token.set_extension('reading', default='')
    if not Token.get_extension('sudachi'):
        Token.set_extension('sudachi', default='')
    if not Token.get_extension('bunsetu_index'):
        Token.set_extension('bunsetu_index', default='')
    if not Token.get_extension('bunsetu_bi_label'):
        Token.set_extension('bunsetu_bi_label', default='')
    if not Token.get_extension('bunsetu_position_type'):
        Token.set_extension('bunsetu_position_type', default='')
    if not Token.get_extension('ne'):
        Token.set_extension('ne', default='')

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return SudachipyTokenizer(nlp)

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        return None
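# A small verification sketch (not from the original source): once the class
# body above has executed, each custom attribute is visible in spaCy's
# extension registry. Token.get_extension returns a
# (default, method, getter, setter) tuple, so the registered defaults can be
# inspected with spaCy alone.
from spacy.tokens import Token

for name in ('inf', 'reading', 'sudachi', 'bunsetu_index',
             'bunsetu_bi_label', 'bunsetu_position_type', 'ne'):
    default = Token.get_extension(name)[0]
    print(name, repr(default))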
def _add_custom_spacy_extensions(self):
    # Add attribute getters for Token
    for n, f in self.is_attrs_name2func:
        ext = Token.get_extension(n)
        if ext is None:
            Token.set_extension(n, getter=f, force=True)

    # Add attribute getters for Span and Doc
    for item in [Span, Doc]:
        for n, f in self.has_attrs_name2func:
            ext = item.get_extension(n)
            if ext is None:
                # print(f"Setting: {item}.set_extension({n}, getter={f})")
                item.set_extension(n, getter=f, force=True)

    # Add attr getters for Span (i.e. Doc.ents)
    for n, f in self.get_attrs_name2func:
        ext = Span.get_extension(n)
        if ext is None:
            Span.set_extension(n, getter=f, force=True)
def __init__(self, data_dir=DATA_DIR, lefff_file_name=LEFFF_FILE_NAME, after_melt=False):
    LOGGER.info('New LefffLemmatizer instantiated.')
    # register your new attribute token._.lefff_lemma
    if not Token.get_extension(self.name):
        Token.set_extension(self.name, default=None)
    else:
        LOGGER.info('Token {} already registered'.format(self.name))
    # In memory lemma mapping
    self.lemma_dict = {}
    self.after_melt = after_melt
    with io.open(os.path.join(data_dir, lefff_file_name), encoding='utf-8') as lefff_file:
        LOGGER.info('Reading lefff data...')
        for line in lefff_file:
            els = line.split('\t')
            self.lemma_dict[(els[0], els[1])] = els[2]
    LOGGER.info('Successfully loaded lefff lemmatizer')
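# Usage sketch (not from the original source): the loop above keys lemma_dict
# by (form, POS) pairs read from the Lefff file, so a lookup takes the same
# shape. The word/tag pair below is purely illustrative, and the call requires
# the Lefff data file to be present under data_dir.
lemmatizer = LefffLemmatizer()
print(lemmatizer.lemma_dict.get(('chevaux', 'nc')))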
def __init__(self, data_dir=DATA_DIR, lexicon_file_name=LEXICON_FILE,
             tag_file_name=TAG_DICT, print_probas=False):
    super(POSTagger, self).__init__(PACKAGE, url=URL_MODEL, download_dir=DATA_DIR)
    if not tk.get_extension(self.name):
        tk.set_extension(self.name, default=None)
    else:
        LOGGER.info('Token {} already registered'.format(self.name))
    LOGGER.info(" TAGGER: Loading lexicon...")
    self.lex_dict = unserialize(lexicon_file_name)
    LOGGER.info(" TAGGER: Loading tags...")
    self.tag_dict = unserialize(tag_file_name)
    self.classifier = MaxEntClassifier()
    self.cache = {}
    self.load_model()
    # print the probability of each tag along with the tag itself
    self.print_probas = print_probas
    return
from .sudachi_tokenizer import SudachiTokenizer, LANG_NAME, TAG_MAP, SUDACHI_DEFAULT_MODE
from .parse_tree import correct_dep
from .syntax_iterators import SYNTAX_ITERATORS, noun_chunks

__all__ = [
    'Japanese',
    'JapaneseCorrector',
    'load_model',
    'save_model',
    'create_model_path',
]

Language.factories['JapaneseCorrector'] = lambda nlp, **cfg: JapaneseCorrector(nlp)

if not Token.get_extension('pos_detail'):
    Token.set_extension('pos_detail', default='')
if not Token.get_extension('inf'):
    Token.set_extension('inf', default='')


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: LANG_NAME
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    # TODO: does not work with spaCy 2.0.12, see workaround in JapaneseCorrector

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return SudachiTokenizer(nlp)
from collections import namedtuple

from spacy.attrs import LANG
from spacy.language import Language
from spacy.tokens import Token
from spacy.vocab import Vocab
from spacy.compat import copy_reg

from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .japanese_corrector import JapaneseCorrector
from .sudachi_tokenizer import SudachiTokenizer

ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

Language.factories['JapaneseCorrector'] = lambda nlp, **cfg: JapaneseCorrector(nlp)

if not Token.get_extension('inf'):
    Token.set_extension('inf', default='')
if not Token.get_extension('bunsetu_bi_label'):
    Token.set_extension('bunsetu_bi_label', default='')
if not Token.get_extension('bunsetu_position_type'):
    Token.set_extension('bunsetu_position_type', default='')


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    # TODO: does not work with spaCy 2.0.12, see workaround in JapaneseCorrector
    writing_system = {
        "direction": "ltr",