def get_spacy_model(spacy_model_name: str,
                    pos_tags: bool,
                    parse: bool,
                    ner: bool,
                    with_custom_tokenizer: bool = False,
                    with_sentence_segmenter: bool = False) -> SpacyModelType:
    """
    In order to avoid loading spacy models repeatedly, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner,
               with_custom_tokenizer, with_sentence_segmenter)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            print(f"Spacy model '{spacy_model_name}' not found. Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        if with_custom_tokenizer:
            spacy_model.tokenizer = combined_rule_tokenizer(spacy_model)
        if with_sentence_segmenter:
            spacy_model.add_pipe(combined_rule_sentence_segmenter, first=True)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
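# Usage sketch for the memoized loader above (illustrative, not from the original
# module; it assumes the module-level cache dict that the function references):
LOADED_SPACY_MODELS = {}  # keyed by (model_name, pos_tags, parse, ner, tokenizer, segmenter)

nlp_a = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b  # identical options hit the cache, so the model loads only once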
def train(self):
    with self.snapshot.training_lock():
        spacy_model_name = os.environ.get('NERD_SPACY_MODEL')
        with log_perf(f'{self.snapshot} TRAINING'):
            try:
                self._nlp = spacy.load(spacy_model_name)
            except OSError:
                logger.warning(
                    f"Spacy model '{spacy_model_name}' not found. Downloading and installing."
                )
                from spacy.cli.download import download as spacy_download
                spacy_download(spacy_model_name)
                from spacy.cli import link
                from spacy.util import get_package_path
                package_path = get_package_path(spacy_model_name)
                link(spacy_model_name, spacy_model_name, force=True, model_path=package_path)
                self._nlp = spacy.load(spacy_model_name)
            self._add_types()
            self._train_snapshot_texts()
        # Only locking when saving to disk; training itself is done in memory.
        with log_perf(f'{self.snapshot} SAVING_TO_DISK'):
            if os.path.exists(self._path):
                shutil.rmtree(self._path)
            self._nlp.to_disk(self._path)
def load_lang_model(lang: str, disable: List[str]):
    """Load a spaCy language model, downloading it first if it is available but not installed.

    Arguments:
        lang {str} -- language
        disable {List[str]} -- If only using tokenizer, can disable ['parser', 'ner', 'textcat']

    Returns:
        The loaded spaCy Language model.
    """
    if 'coref' in lang:
        try:
            return spacy.load(lang, disable=disable)
        except Exception:
            return SpacyAnnotator.load_lang_model(lang.split('_')[0], disable=disable)
    try:
        return spacy.load(lang, disable=disable)
    except OSError:
        logger.warning(
            f"Spacy model '{lang}' not found. Downloading and installing."
        )
        spacy_download(lang)
        # NOTE(mattg): The following four lines are a workaround suggested by Ines for spacy
        # 2.1.0, which removed the linking that was done in spacy 2.0. importlib doesn't find
        # packages that were installed in the same python session, so the way `spacy_download`
        # works in 2.1.0 is broken for this use case. These four lines can probably be removed
        # at some point in the future, once spacy has figured out a better way to handle this.
        # See https://github.com/explosion/spaCy/issues/3435.
        from spacy.cli import link
        from spacy.util import get_package_path
        package_path = get_package_path(lang)
        link(lang, lang, model_path=package_path)
        return spacy.load(lang, disable=disable)
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models repeatedly, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy model '{spacy_model_name}' not found. Downloading and installing.")
            spacy_download(spacy_model_name)
            # NOTE(mattg): The following four lines are a workaround suggested by Ines for spacy
            # 2.1.0, which removed the linking that was done in spacy 2.0. importlib doesn't find
            # packages that were installed in the same python session, so the way `spacy_download`
            # works in 2.1.0 is broken for this use case. These four lines can probably be removed
            # at some point in the future, once spacy has figured out a better way to handle this.
            # See https://github.com/explosion/spaCy/issues/3435.
            from spacy.cli import link
            from spacy.util import get_package_path
            package_path = get_package_path(spacy_model_name)
            link(spacy_model_name, spacy_model_name, model_path=package_path)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models repeatedly, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy model '{spacy_model_name}' not found. Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models repeatedly, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ["vectors", "textcat"]
        if not pos_tags:
            disable.append("tagger")
        if not parse:
            disable.append("parser")
        if not ner:
            disable.append("ner")
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy model '{spacy_model_name}' not found. Downloading and installing."
            )
            spacy_download(spacy_model_name)
            # Import the downloaded model module directly and load from there.
            spacy_model_module = __import__(spacy_model_name)
            spacy_model = spacy_model_module.load(disable=disable)  # type: ignore
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
def __init__(
    self,
    model="en",
    disable=None,
    display_prompt=True,
    n_jobs=8,
    batch_size=1500,
    spacy_doc=False,
    show_tok=True,
    show_doc=True,
    ptb_pos=False,
):
    if disable is None:
        disable = []
    try:
        self._parser = spacy.load(model, disable=disable)
    except OSError:
        url = "https://spacy.io/models"
        if display_prompt and license_prompt("Spacy {} model".format(model), url) is False:
            sys.exit(0)
        spacy_download(model)
        print("Spacy model installed, please rerun your command.")
        sys.exit(0)
    self.n_jobs = n_jobs
    self.batch_size = batch_size
    self.spacy_doc = spacy_doc
    self.show_tok = show_tok
    self.show_doc = show_doc
    self.ptb_pos = ptb_pos
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models repeatedly, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy model '{spacy_model_name}' not found. Downloading and installing."
            )
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
def setup_model(model):
    try:
        nlp = spacy.load(model)
    except OSError:
        print(f"Spacy model '{model}' not found. Downloading and installing.")
        spacy_download(model)
        nlp = spacy.load(model)
    return nlp
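# Example use of the load-or-download helper above (a sketch; assumes `spacy` and
# `spacy_download` are imported as in the snippet):
nlp = setup_model('en_core_web_sm')
doc = nlp('Apple is looking at buying a U.K. startup.')
print([(ent.text, ent.label_) for ent in doc.ents])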
def get_spacy_model(spacy_model_name='en_core_web_sm'):
    try:
        nlp = spacy.load(spacy_model_name)
    except OSError:
        log.info('The %s model was not found. Loading "en_core_web_sm"...', spacy_model_name)
        spacy_download('en_core_web_sm')
        nlp = spacy.load('en_core_web_sm')
    return nlp
def load_spacy(model_name):
    try:
        model = spacy.load(model_name)
    except OSError:
        print(f"Spacy model '{model_name}' not found. Downloading and installing.")
        spacy_download(model_name)
        model = spacy.load(model_name)
    return model
def setup_model():
    global NLP
    global MODEL
    try:
        NLP = spacy.load(MODEL)
    except OSError:
        print(f"Spacy model '{MODEL}' not found. Downloading and installing.")
        spacy_download(MODEL)
        NLP = spacy.load(MODEL)
def download_models():
    print('Downloading models...')
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    print('...done downloading.\nImporting and loading model.')
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    print(nlp)
    spacy_download('en')
def __init__(self, model="en", disable=None, display_prompt=True): if disable is None: disable = [] try: self._parser = spacy.load(model, disable=disable) except OSError: url = "https://spacy.io/models" if display_prompt and license_prompt("Spacy {} model".format(model), url) is False: sys.exit(0) spacy_download(model) self._parser = spacy.load(model, disable=disable)
def __init__(self, model='en', disable=None):
    if disable is None:
        disable = []
    try:
        self._parser = spacy.load(model, disable=disable)
    except OSError:
        url = 'https://spacy.io/models'
        if license_prompt('Spacy {} model'.format(model), url) is False:
            sys.exit(0)
        spacy_download(model)
        self._parser = spacy.load(model, disable=disable)
def spacy_downloader(spacy_model_name: str, pos_tags: bool, parse: bool,
                     ner: bool) -> SpacyModelType:
    '''
    This is a copy of the allennlp.common.util.get_spacy_model function. It, in
    effect, downloads the relevant spacy model and loads it with the relevant
    taggers (e.g. POS, parse, and NER) for that model, which is language
    dependent. Spacy can have multiple trained models per language, based on size.

    :param spacy_model_name: Name of the Spacy model e.g. en_core_web_sm
    :param pos_tags: Whether or not the returned Spacy model should perform POS tagging.
    :param parse: Whether or not the returned Spacy model should perform parsing.
    :param ner: Whether or not the returned Spacy model should perform NER.
    :returns: The relevant Spacy model.
    '''
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        # This needs manually updating each time Spacy is updated. Supported
        # languages can be found here: https://spacy.io/usage/models
        supported_codes = ['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt', 'xx']
        lang_code = spacy_model_name[:2]
        if lang_code not in supported_codes:
            raise ValueError('Spacy does not support the following language '
                             f'{lang_code}. These languages are supported '
                             f'{supported_codes}')
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            print(f"Spacy model '{spacy_model_name}' not found. "
                  "Downloading and installing.")
            spacy_download(spacy_model_name)
            from spacy.cli import link
            from spacy.util import get_package_path
            package_path = get_package_path(spacy_model_name)
            link(spacy_model_name, spacy_model_name, model_path=package_path)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
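# Illustration of the language-code guard in spacy_downloader above (hypothetical
# call; 'ja' is not in the supported_codes list, so this raises ValueError):
try:
    spacy_downloader('ja_core_news_sm', pos_tags=True, parse=False, ner=False)
except ValueError as err:
    print(err)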
def _prepare_tests():
    import spacy
    from spacy.cli.download import download as spacy_download
    try:
        spacy.load('en')
    except OSError:
        spacy_download('en')
    from nlp_architect.api.machine_comprehension_api import MachineComprehensionApi
    from nlp_architect.api.intent_extraction_api import IntentExtractionApi
    from nlp_architect.api.ner_api import NerApi
    NerApi(prompt=False)
    IntentExtractionApi(prompt=False)
    MachineComprehensionApi(prompt=False).download_model()
def init():
    global inference
    spacy_download('en')
    aspect_lex_path = Model.get_model_path('c_aspect_lex')
    opinion_lex_path = Model.get_model_path('c_opinion_lex')
    print("%------------------------------------------%")
    print("aspect_lex_path: ", Path(aspect_lex_path))
    print("current wd: ", os.getcwd())
    path = Path(aspect_lex_path)
    print("pathlib-exists()---->", path.exists())
    print("Path :", path)
    print("Parent :", Path(aspect_lex_path).parent.parent.parent)
    print(os.listdir(Path(aspect_lex_path).parent.parent.parent))
    print("%-----------------------------------------%")
    inference = SentimentInference(aspect_lex_path, opinion_lex_path)
def select_spacy_model(spacy_model_name: str) -> SpacyModelType:
    """
    Check whether an instance of the specified Spacy model has already been
    loaded. If so, return it; otherwise, load the model. Loaded models are
    stored in LOADED_SPACY_MODELS.
    """
    if spacy_model_name not in LOADED_SPACY_MODELS:
        try:
            spacy_model = spacy.load(spacy_model_name, disable=["ner"])
        except OSError:
            print(
                f"Spacy model '{spacy_model_name}' not found. Downloading and installing."
            )
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=["ner"])
        LOADED_SPACY_MODELS[spacy_model_name] = spacy_model
    return LOADED_SPACY_MODELS[spacy_model_name]
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import json
import re

from tqdm import tqdm
from nltk import flatten
from nlp_architect.models.absa.inference.inference import SentimentInference
from spacy.cli.download import download as spacy_download
import spacy
from spacy.lang.en import English

# Load the English language model.
spacy_download('en')

# Construction via create_pipe
nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)


def word_freq(word_list):
    """Return a dict mapping each word in word_list to its frequency."""
    counts = [word_list.count(w) for w in word_list]
    return dict(zip(word_list, counts))
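# Quick sketch showing the sentencizer pipeline and word_freq helper above in
# action (illustrative input; exact counts depend on tokenization):
doc = nlp("The battery is great. The battery lasts all day.")
print([sent.text for sent in doc.sents])  # two sentences
tokens = [tok.text.lower() for tok in doc if tok.is_alpha]
print(word_freq(tokens))  # e.g. {'the': 2, 'battery': 2, ...}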
def __init__(self,
             mode='all',
             config_file='multiwoz_all_context.json',
             model_file='https://convlab.blob.core.windows.net/convlab-2/bert_multiwoz_all_context.zip'):
    assert mode == 'usr' or mode == 'sys' or mode == 'all'
    self.mode = mode
    config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'configs/{}'.format(config_file))
    config = json.load(open(config_file))
    # print(config['DEVICE'])
    # DEVICE = config['DEVICE']
    DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda:0'
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_dir = os.path.join(root_dir, config['data_dir'])
    output_dir = os.path.join(root_dir, config['output_dir'])

    if not os.path.exists(os.path.join(data_dir, 'intent_vocab.json')):
        preprocess(mode)

    intent_vocab = json.load(open(os.path.join(data_dir, 'intent_vocab.json')))
    tag_vocab = json.load(open(os.path.join(data_dir, 'tag_vocab.json')))
    dataloader = Dataloader(
        intent_vocab=intent_vocab,
        tag_vocab=tag_vocab,
        pretrained_weights=config['model']['pretrained_weights'])
    print('intent num:', len(intent_vocab))
    print('tag num:', len(tag_vocab))

    best_model_path = os.path.join(output_dir, 'pytorch_model.bin')
    if not os.path.exists(best_model_path):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print('Load from model_file param')
        archive_file = cached_path(model_file)
        archive = zipfile.ZipFile(archive_file, 'r')
        archive.extractall(root_dir)
        archive.close()
    print('Load from', best_model_path)
    model = JointBERT(config['model'], DEVICE, dataloader.tag_dim, dataloader.intent_dim)
    model.load_state_dict(torch.load(os.path.join(output_dir, 'pytorch_model.bin'), DEVICE))
    model.to(DEVICE)
    model.eval()

    self.model = model
    self.use_context = config['model']['context']
    self.dataloader = dataloader
    try:
        self.nlp = spacy.load("en_core_web_sm")
    except Exception:
        print('download en_core_web_sm for spacy')
        from spacy.cli.download import download as spacy_download
        spacy_download("en_core_web_sm")
        spacy_model_module = __import__("en_core_web_sm")
        self.nlp = spacy_model_module.load()
    with open(os.path.join(get_root_path(), 'data/multiwoz/db/postcode.json'), 'r') as f:
        token_list = json.load(f)

    for token in token_list:
        token = token.strip()
        self.nlp.tokenizer.add_special_case(token, [{
            ORTH: token,
            LEMMA: token,
            POS: u'NOUN'
        }])
    print("BERTNLU loaded")
def get_spacy_model(
    spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool
) -> SpacyModelType:
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ["vectors", "textcat"]
        if not pos_tags:
            disable.append("tagger")
        if not parse:
            disable.append("parser")
        if not ner:
            disable.append("ner")
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy model '{spacy_model_name}' not found. Downloading and installing."
            )
            spacy_download(spacy_model_name)
            # Import the downloaded model module directly and load from there.
            spacy_model_module = __import__(spacy_model_name)
            spacy_model = spacy_model_module.load(disable=disable)  # type: ignore
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]


@contextmanager
def pushd(new_dir: PathType, verbose: bool = False) -> ContextManagerFunctionReturnType[None]:
    previous_dir = os.getcwd()
    if verbose:
        logger.info(f"Changing directory to {new_dir}")
    os.chdir(new_dir)
    try:
        yield
    finally:
        if verbose:
            logger.info(f"Changing directory back to {previous_dir}")
        os.chdir(previous_dir)


@contextmanager
def push_python_path(path: PathType) -> ContextManagerFunctionReturnType[None]:
    # `sys.path` can misbehave with relative paths, so resolve to an absolute one.
    path = Path(path).resolve()
    path = str(path)
    sys.path.insert(0, path)
    try:
        yield
    finally:
        # Remove by value, in case `sys.path` was manipulated in between.
        sys.path.remove(path)


def import_module_and_submodules(package_name: str) -> None:
    importlib.invalidate_caches()

    # Python doesn't always add the current directory to the path by default.
    with push_python_path("."):
        # Import at top level.
        module = importlib.import_module(package_name)
        path = getattr(module, "__path__", [])
        path_string = "" if not path else path[0]

        # walk_packages only finds immediate children, so we need to recurse.
        for module_finder, name, _ in pkgutil.walk_packages(path):
            # `pkgutil.walk_packages` can also return third-party libraries that
            # happen to be on the path, so skip those.
            if path_string and module_finder.path != path_string:
                continue
            subpackage = f"{package_name}.{name}"
            import_module_and_submodules(subpackage)


def peak_memory_mb() -> Dict[int, float]:
    if resource is None or sys.platform not in ("linux", "darwin"):
        peak_mb = 0.0
    else:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform == "darwin":
            # On OSX the result is in bytes.
            peak_mb = peak / 1_000_000
        else:
            # On Linux the result is in kilobytes.
            peak_mb = peak / 1_000

    if is_distributed():
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()
        peak_mb_tensor = torch.tensor([float(global_rank), peak_mb])
        # All of these tensors will be gathered into this list.
        gather_results = [torch.tensor([0.0, 0.0]) for _ in range(world_size)]

        # The 'nccl' backend means we're training on GPUs, so the tensors need
        # to be on GPU as well.
        if dist.get_backend() == "nccl":
            peak_mb_tensor = peak_mb_tensor.cuda()
            gather_results = [x.cuda() for x in gather_results]

        dist.all_gather(gather_results, peak_mb_tensor)

        results_dict: Dict[int, float] = {}
        for peak_mb_tensor in gather_results:
            worker = int(peak_mb_tensor[0])
            peak_mb = round(float(peak_mb_tensor[1]), 3)
            results_dict[worker] = peak_mb

        return results_dict
    else:
        return {0: peak_mb}


def gpu_memory_mb() -> Dict[int, int]:
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"],
            encoding="utf-8",
        )
        gpu_memory = [int(x) for x in result.strip().split("\n")]
        return {gpu: memory for gpu, memory in enumerate(gpu_memory)}
    except FileNotFoundError:
        # `nvidia-smi` doesn't exist; assume there are no GPUs.
        return {}
    except:  # noqa
        logger.warning(
            "unable to check gpu_memory_mb() due to occasional failure, continuing", exc_info=True
        )
        return {}


def ensure_list(iterable: Iterable[A]) -> List[A]:
    if isinstance(iterable, list):
        return iterable
    else:
        return list(iterable)


def is_lazy(iterable: Iterable[A]) -> bool:
    return not isinstance(iterable, list)


def int_to_device(device: Union[int, torch.device]) -> torch.device:
    if isinstance(device, torch.device):
        return device
    if device < 0:
        return torch.device("cpu")
    return torch.device(device)


def log_frozen_and_tunable_parameter_names(model: torch.nn.Module) -> None:
    frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(model)

    logger.info("The following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)

    logger.info("The following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)


def get_frozen_and_tunable_parameter_names(
    model: torch.nn.Module,
) -> Tuple[Iterable[str], Iterable[str]]:
    frozen_parameter_names = (
        name for name, parameter in model.named_parameters() if not parameter.requires_grad
    )
    tunable_parameter_names = (
        name for name, parameter in model.named_parameters() if parameter.requires_grad
    )
    return frozen_parameter_names, tunable_parameter_names


def dump_metrics(file_path: Optional[str], metrics: Dict[str, Any], log: bool = False) -> None:
    metrics_json = json.dumps(metrics, indent=2)
    if file_path:
        with open(file_path, "w") as metrics_file:
            metrics_file.write(metrics_json)
    if log:
        logger.info("Metrics: %s", metrics_json)


def flatten_filename(file_path: str) -> str:
    return file_path.replace("/", "_SLASH_")


def is_master(
    global_rank: int = None, world_size: int = None, num_procs_per_node: int = None
) -> bool:
    if not is_distributed():
        return True

    if global_rank is None:
        global_rank = dist.get_rank()

    if world_size is None:
        world_size = dist.get_world_size()

    if num_procs_per_node is None and os.environ:
        num_procs_per_node = int(os.environ.get("ALLENNLP_PROCS_PER_NODE", world_size))

    return global_rank % (world_size / num_procs_per_node) == 0


def is_distributed() -> bool:
    return dist.is_available() and dist.is_initialized()


def sanitize_wordpiece(wordpiece: str) -> str:
    if wordpiece.startswith("##"):
        return wordpiece[2:]
    elif wordpiece.startswith("Ġ"):
        return wordpiece[1:]
    elif wordpiece.startswith("▁"):
        return wordpiece[1:]
    else:
        return wordpiece
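# Behavior sketch for sanitize_wordpiece as reconstructed above: it strips the
# subword markers used by BERT-style ('##'), GPT-2-style ('Ġ'), and
# SentencePiece ('▁') tokenizers.
for piece in ["##ing", "Ġworld", "▁hello", "plain"]:
    print(sanitize_wordpiece(piece))  # -> ing, world, hello, plain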
import math
import logging
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import nltk
from nltk.tokenize import sent_tokenize
from spacy.cli.download import download as spacy_download

try:
    import en_core_web_sm
except ImportError:
    logging.warning("Spacy en_core_web_sm not found. Downloading and installing.")
    spacy_download("en_core_web_sm")
    import en_core_web_sm

from collections import Counter, defaultdict, OrderedDict
import time
import os
from enum import Enum, auto


class ParseAndModel:
    """
    Handles the data input chain and, based on this data, computes matrices for
    reviews and features.

    Usage:
        pm = ParseAndModel(feature_list=["sound", "battery", ["screen", "display"]],
                           filename='../tests/data/parse_and_model/iPod.final')
        print(pm.model_results)
    """

    class InputType(Enum):
import nltk
import spacy
from nltk.tokenize import word_tokenize

try:
    # If the model is not on the machine, download it.
    import pt_core_news_sm  # noqa
except ImportError:  # noqa
    from spacy.cli.download import download as spacy_download
    spacy_download("pt_core_news_sm")

nltk.download("punkt")
sp = spacy.load("pt_core_news_sm")


def remove_portuguese_stopwords(text, custom_stopwords=None):
    text = text.lower()
    all_stopwords = sp.Defaults.stop_words
    abc = [char for char in "abcdefghijklmnopqrstuvxyzw"]
    if not custom_stopwords:
        custom_stopwords = []
    additional_stopwords = list(all_stopwords) + abc + custom_stopwords
    text_tokens = word_tokenize(text)
    return " ".join(
        [word for word in text_tokens if word not in additional_stopwords])
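# Example use of the stopword filter above (a sketch; the exact output depends on
# the pt_core_news_sm stop-word list):
print(remove_portuguese_stopwords("O produto é muito bom", custom_stopwords=["produto"]))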
def default_nlp_model():  # pragma: no cover
    spacy_download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')
    return nlp