def load_parser(chunker):
    """Load a spaCy pipeline, optionally with the NLP Architect NP chunker.

    Args:
        chunker (str): chunker selector; when it contains ``'nlp_arch'`` the
            NLP Architect ``NPAnnotator`` pipe is added (model files are
            downloaded on first use), otherwise a plain spaCy parser is used.

    Returns:
        The spaCy ``Language`` pipeline object.
    """
    logger.info('loading spacy. chunker=%s', chunker)
    if 'nlp_arch' in chunker:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner', 'parser']).parser
        # The dependency parser is disabled, so add a sentencizer to provide
        # the sentence boundaries NPAnnotator relies on.
        parser.add_pipe(parser.create_pipe('sentencizer'), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        # exist_ok avoids the race between an exists() check and the mkdir.
        makedirs(chunker_path, exist_ok=True)
        if not path.exists(_path_to_model):
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect'
                ' word chunker model is licensed under Apache 2.0')
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner']).parser
    logger.info('spacy loaded')
    return parser
def load_parser(chunker):
    """Load a spaCy pipeline, optionally with the NLP Architect NP chunker.

    Args:
        chunker (str): chunker selector; when it contains ``"nlp_arch"`` the
            NLP Architect ``NPAnnotator`` pipe is added (model files are
            downloaded on first use), otherwise a plain spaCy parser is used.

    Returns:
        The spaCy ``Language`` pipeline object.
    """
    logger.info("loading spacy. chunker=%s", chunker)
    if "nlp_arch" in chunker:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner", "parser"]).parser
        # The dependency parser is disabled, so add a sentencizer to provide
        # the sentence boundaries NPAnnotator relies on.
        parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        # exist_ok avoids the race between an exists() check and the mkdir.
        makedirs(chunker_path, exist_ok=True)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect"
                " word chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner"]).parser
    logger.info("spacy loaded")
    return parser
def test_np_annotator_linked(model_path, settings_path, text, phrases):
    """Check that every expected phrase is found among the extracted NPs."""
    parser = SpacyInstance(model="en",
                           disable=["textcat", "ner", "parser"]).parser
    parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
    parser.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    doc = parser(text)
    extracted = {np.text for np in get_noun_phrases(doc)}
    for phrase in phrases:
        assert phrase in extracted
def __init__(self, parser=None):
    """Build the spaCy pipeline and attach the NLP Architect NP chunker.

    Args:
        parser: an existing spaCy ``Language`` pipeline to reuse; when
            ``None`` a default ``SpacyInstance`` parser is created.

    Side effects:
        Downloads the chunker model/params files into
        ``chunker_local_path`` on first use.
    """
    if parser is None:
        self.nlp = SpacyInstance(
            disable=['ner', 'parser', 'vectors', 'textcat']).parser
    else:
        self.nlp = parser
    # The dependency parser may be disabled, so a sentencizer supplies the
    # sentence boundaries NPAnnotator relies on.
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'), first=True)
    _path_to_model = path.join(chunker_local_path, chunker_model_file)
    # exist_ok avoids the race between an exists() check and the mkdir.
    makedirs(chunker_local_path, exist_ok=True)
    if not path.exists(_path_to_model):
        logger.info(
            'The pre-trained model to be downloaded for NLP Architect word'
            ' chunker model is licensed under Apache 2.0')
        download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                 _path_to_model)
    _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
    if not path.exists(_path_to_params):
        download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                 _path_to_params)
    self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                      last=True)
logger.info('loading spacy') if 'nlp_arch' in args.chunker: nlp = SpacyInstance(model='en_core_web_sm', disable=['textcat', 'ner', 'parser']).parser nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True) logger.info( 'The pre-trained model to be downloaded for NLP Architect word' ' chunker model is licensed under Apache 2.0') _path_to_model = path.join(cur_dir, chunker_model_file) download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model) _path_to_params = path.join(cur_dir, chunker_model_dat_file) download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params) logger.info('Done.') nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True) else: nlp = SpacyInstance(model='en_core_web_sm', disable=['textcat', 'ner']).parser logger.info('spacy loaded') num_lines = sum(1 for line in corpus_file) corpus_file.seek(0) logger.info('%i lines in corpus', num_lines) i = 0 with tqdm(total=num_lines) as pbar: for doc in nlp.pipe(corpus_file, n_threads=-1): if 'nlp_arch' in args.chunker: spans = get_noun_phrases(doc)
def test_np_annotator_load(model_path, settings_path):
    """Smoke-test: NPAnnotator.load returns a truthy annotator object."""
    annotator = NPAnnotator.load(model_path, settings_path)
    assert annotator