def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500,
             conll=None, use_no_coref_list=True, debug=False):
    self.greedyness = greedyness
    self.max_dist = max_dist
    self.max_dist_match = max_dist_match
    self.debug = debug
    if nlp is None:
        print("Loading spacy model")
        try:
            spacy.info('en_core_web_sm')
            model = 'en_core_web_sm'
        except IOError:
            print("No spacy 2 model detected, using spacy1 'en' model")
            model = 'en'
        nlp = spacy.load(model)
    model_path = os.path.join(PACKAGE_DIRECTORY,
                              "weights/conll/" if conll is not None else "weights/")
    embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
    print("loading model from", model_path)
    self.data = Data(nlp, model_path=embed_model_path, conll=conll,
                     use_no_coref_list=use_no_coref_list, consider_speakers=conll)
    self.coref_model = Model(model_path)
    self.clusters = {}
    self.mention_to_cluster = []
    self.mentions_single_scores = {}
    self.mentions_single_features = {}
    self.mentions_pairs_scores = {}
    self.mentions_pairs_features = {}
def __init__(self, nlp=None, coref=None):
    if nlp is None:
        print("Loading spacy model")
        try:
            spacy.info('en_core_web_sm')
            model = 'en_core_web_sm'
        except IOError:
            print("No spacy 2 model detected, using spacy1 'en' model")
            model = 'en'
        self.nlp = spacy.load(model)
    else:
        # keep the pipeline that was passed in
        self.nlp = nlp
    if not coref:
        try:
            from neuralcoref_lib.neuralcoref import Coref
            self.coref = Coref(nlp=self.nlp)
        except Exception:
            self.coref = coref
            logging.info('Coreference not available')
    else:
        self.coref = coref
    self.spacy_tags = {
        'pos': {
            'noun': ['NOUN', 'PROPN'],
            'pronoun': ['PRON']
        },
        'dep': {
            'subject': ['csubj', 'csubjpass', 'nsubj', 'nsubjpass'],  # 'agent', 'expl',
            'object': ['dobj', 'iobj', 'oprd']
        }
    }  # ADD 'attr' for spacy 2
    self.conversational_pronouns = ["i", "you"]
    self.grammatical_ranking = ['S', 'O', 'X']
def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500,
             conll=None, blacklist=True, debug=False):
    self.greedyness = greedyness
    self.max_dist = max_dist
    self.max_dist_match = max_dist_match
    self.debug = debug
    embed_path = 'weights/'
    if embed_path is not None:
        self.embed_extractor = EmbeddingExtractor(embed_path)
    # model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
    # model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
    model_path = "checkpoints/"
    print("Loading neuralcoref model from", model_path)
    self.coref_model = CModel(model_path)
    if nlp is None:
        print("Loading spacy model")
        try:
            spacy.info('en_core_web_sm')
            model = 'en_core_web_sm'
        except IOError:
            print("No spacy 2 model detected, using spacy1 'en' model")
            spacy.info('en')
            model = 'en'
        nlp = spacy.load(model)
    self.data = Document(nlp, conll=conll, blacklist=blacklist, model_path='weights/')
    self.clusters = {}
    self.mention_to_cluster = []
    self.mentions_single_scores = {}
    self.mentions_pairs_scores = {}
def mention_detection_debug(sentence): print("🌋 Loading spacy model") try: spacy.info("en_core_web_sm") model = "en_core_web_sm" except IOError: print("No spacy 2 model detected, using spacy1 'en' model") spacy.info("en") model = "en" nlp = spacy.load(model) doc = nlp(sentence.decode("utf-8")) mentions = extract_mentions_spans(doc, blacklist={}, debug=True) for mention in mentions: print(mention)
def mention_detection_debug(sentence):
    print(u"🌋 Loading spacy model")
    try:
        spacy.info('en_core_web_sm')
        model = 'en_core_web_sm'
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        spacy.info('en')
        model = 'en'
    nlp = spacy.load(model)
    doc = nlp(sentence.decode('utf-8'))
    mentions = extract_mentions_spans(doc, blacklist=False, debug=True)
    for mention in mentions:
        print(mention)
def list_linked_spacy_models():
    """ Read SPACY/data and return a list of link_name """
    spacy_data = os.path.join(spacy.info(silent=True)['Location'], 'data')
    linked = [d for d in os.listdir(spacy_data)
              if os.path.islink(os.path.join(spacy_data, d))]
    # linked = [os.path.join(spacy_data, d) for d in os.listdir(spacy_data)]
    # linked = {os.readlink(d): os.path.basename(d) for d in linked if os.path.islink(d)}
    return linked
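# Added usage sketch (not from the snippet above): a minimal way to print the shortcut
# links found by list_linked_spacy_models(). Assumes spaCy 2.x, where shortcut links
# still exist and spacy.info(silent=True) returns a dict with a 'Location' key.
import os
import spacy

for link_name in list_linked_spacy_models():
    print("linked spaCy model:", link_name)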
def get_nlp(language):
    """Get NLP using spacy."""
    import spacy  # pylint: disable=import-outside-toplevel

    if language not in spacy.info()["Models"]:
        spacy.cli.download(language)
    return spacy.load(language)
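# Added usage sketch (not from the snippet above): assumes a spaCy version whose
# spacy.info() return value includes a "Models" key, as get_nlp() expects, and that
# "en_core_web_sm" is a downloadable model name.
nlp = get_nlp("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
print([(token.text, token.pos_) for token in doc])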
def __init__(self):
    logger.info("Loading NLP model: spaCy en_core_web_lg")
    self.nlp = {
        "en": spacy.load("en_core_web_lg", disable=['parser', 'tagger'])
    }
    logger.info("Printing spaCy model and package details:"
                "\n\n {}\n\n".format(spacy.info("en_core_web_lg")))
def test_download(self):
    # Download model beforehand
    model_path = download_model('spacy', DEFAULT_CACHE_DIR,
                                process_func=_unzip_process_func,
                                verbose=True)
    info = spacy.info(model_path)
    self.assertListEqual(info['pipeline'], ['tagger', 'parser', 'ner'])
    self.assertEqual(info['lang'], 'da')
def __init__(self, models=None):
    if not models:
        models = {"en": "en_core_web_lg"}
    logger.debug(f"Loading SpaCy models: {models.values()}")
    self.nlp = {
        lang_code: spacy.load(model_name, disable=['parser', 'tagger'])
        for lang_code, model_name in models.items()
    }
    for model_name in models.values():
        logger.debug("Printing spaCy model and package details:"
                     "\n\n {}\n\n".format(spacy.info(model_name)))
def __init__(
    self,
    nlp=None,
    greedyness=0.5,
    max_dist=50,
    max_dist_match=500,
    conll=None,
    blacklist=True,
    debug=False,
):
    self.greedyness = greedyness
    self.max_dist = max_dist
    self.max_dist_match = max_dist_match
    self.debug = debug
    model_path = os.path.join(
        PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
    model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
    print("Loading neuralcoref model from", model_path)
    self.coref_model = Model(model_path)
    if nlp is None:
        print("Loading spacy model")
        try:
            spacy.info("en_core_web_sm")
            model = "en_core_web_sm"
        except IOError:
            print("No spacy 2 model detected, using spacy1 'en' model")
            spacy.info("en")
            model = "en"
        nlp = spacy.load(model)
    self.data = Document(nlp, conll=conll, blacklist=blacklist, model_path=model_path)
    self.clusters = {}
    self.mention_to_cluster = []
    self.mentions_single_scores = {}
    self.mentions_pairs_scores = {}
def check_spacy_model(model) -> bool:
    spacy_info = spacy.info()
    pipelines = spacy_info.get('pipelines', spacy_info.get('models', None))
    if pipelines is None:
        raise ValueError('Unable to detect spacy models.')
    models = list(pipelines.keys())
    if model not in models:
        msg.info("Downloading spacy model {}".format(model))
        spacy.cli.download(model)
        # spacy.info() doesn't update after a spacy.cli.download, so there's no point checking it
        models.append(model)
    # Always returns true; if it fails to download, spacy sys.exit()s
    return model in models
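# Added usage sketch (not from the snippet above): check_spacy_model() is assumed to be
# in scope together with its wasabi `msg` printer; "en_core_web_sm" is just an example
# model name.
import spacy

if check_spacy_model("en_core_web_sm"):
    nlp = spacy.load("en_core_web_sm")
    print(nlp("spaCy is ready."))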
def str2spacy(model):
    if int(spacy.__version__.split('.')[0]) < 3:
        downloaded_models = [os.path.basename(m) for m in list_downloaded_spacy_models()]
        links = list_linked_spacy_models()
    else:
        # As of spacy v3, links do not exist anymore and it is simpler to get a list of
        # downloaded models
        downloaded_models = list(spacy.info()['pipelines'])
        links = []
    filtered_downloaded = [m for m in downloaded_models if m[:2] == model]
    if model in downloaded_models + links:
        # Check whether `model` is the name of a model/link
        return model
    elif filtered_downloaded:
        # Check whether `model` is a lang code and corresponds to a downloaded model
        return filtered_downloaded[0]
    else:
        # Return asked model to have an informative error.
        return model
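# Added usage sketch (not from the snippet above): assumes an English model such as
# "en_core_web_sm" is already downloaded, so the helper can resolve a bare lang code.
print(str2spacy("en"))              # a downloaded model whose name starts with "en", per the lang-code branch
print(str2spacy("en_core_web_sm"))  # returned unchanged when it is an installed model name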
def mention_detection_debug(sentence, lang="en"): print(u"🌋 Loading spacy model") if lang == "en": try: spacy.info('en_core_web_sm') model = 'en_core_web_sm' except IOError: print("No spacy 2 model detected, using spacy1 'en' model") spacy.info('en') model = 'en' elif lang == "de": try: spacy.info("de_core_news_sm") model = "de_core_news_sm" except IOError as e: print("german model not there") nlp = spacy.load(model) doc = nlp(sentence) mentions = extract_mentions_spans(doc, blacklist=False, debug=True) print("==-=-= Extracted mentions =-=-==") for mention in mentions: print(mention)
    'pt': 'pt_core_news_sm',
    'it': 'it_core_news_sm',
    'nl': 'nl_core_news_sm',
    'xx': 'xx_core_news_sm'
}

from flask import Flask, request, jsonify
import spacy, re, os, sys

if len(sys.argv) < 2 or not sys.argv[1]:
    print("You must give the iso_code of the language model to launch (fr,en,de,es,pt,it,nl,xx). Exiting.")
    exit()
else:
    # get spacy info
    spacy.info()
    # get path of models
    # path = spacy.util.get_data_path()
    # print("Path to models : ", path)
    # now load spacy model for the language
    lang = sys.argv[1]  # CHANGE ACCORDING TO THE LANGUAGE YOU WORK ON !
    try:
        nlp = spacy.load(lang_models[lang])
        app = Flask(__name__)
        print('Model : ' + lang + " loaded!")
    except Exception as e:
        print("Bad iso code or other error : ", str(e))
        exit()


@app.route("/check")
def load_language_resource(descriptor):
    # Print metadata for the model named by `descriptor`, then load and return it.
    print(spacy.info(descriptor))
    nlp = spacy.load(descriptor)
    return nlp
def read_corpus(self, data_path, debug=False):
    print("🌋 Reading files")
    for dirpath, _, filenames in os.walk(data_path):
        print("In", dirpath, os.path.abspath(dirpath))
        file_list = [os.path.join(dirpath, f) for f in filenames
                     if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
        cleaned_file_list = []
        for f in file_list:
            fn = f.split('.')
            if fn[1] == "v4_auto_conll":
                gold = fn[0] + "." + "v4_gold_conll"
                if gold not in file_list:
                    cleaned_file_list.append(f)
            else:
                cleaned_file_list.append(f)
        doc_list = parallel_process(cleaned_file_list, load_file)
        for docs in doc_list:  # executor.map(self.load_file, cleaned_file_list):
            for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
                print("Imported", name)
                if debug:
                    print("utts_text", utts_text)
                    print("utt_tokens", utt_tokens)
                    print("utts_corefs", utts_corefs)
                    print("utts_speakers", utts_speakers)
                    print("name, part", name, part)
                self.utts_text += utts_text
                self.utts_tokens += utt_tokens
                self.utts_corefs += utts_corefs
                self.utts_speakers += utts_speakers
                self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
                self.docs_names.append((name, part))
    print("utts_text size", len(self.utts_text))
    print("utts_tokens size", len(self.utts_tokens))
    print("utts_corefs size", len(self.utts_corefs))
    print("utts_speakers size", len(self.utts_speakers))
    print("utts_doc_idx size", len(self.utts_doc_idx))
    print("🌋 Building docs")
    for name, part in self.docs_names:
        self.docs.append(
            ConllDoc(name=name, part=part, nlp=None,
                     blacklist=False, consider_speakers=True,
                     embedding_extractor=self.embed_extractor,
                     conll=CONLL_GENRES[name[:2]]))
    print("🌋 Loading spacy model")
    try:
        spacy.info('en_core_web_sm')
        model = 'en_core_web_sm'
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        spacy.info('en')
        model = 'en'
    nlp = spacy.load(model)
    print("🌋 Parsing utterances and filling docs")
    doc_iter = (s for s in self.utts_text)
    for utt_tuple in tqdm(zip(nlp.pipe(doc_iter), self.utts_tokens, self.utts_corefs,
                              self.utts_speakers, self.utts_doc_idx)):
        spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
        if debug:
            print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
        doc = spacy_tokens
        if debug:
            out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
                      " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id)
            print(out_str.encode('utf-8'))
        self.docs[doc_id].add_conll_utterance(
            doc, conll_tokens, corefs, speaker,
            use_gold_mentions=self.use_gold_mentions)
def read_corpus(self, data_path, debug=False):
    # this function holds the key to constructing the memory module for the conll corpus
    # find the discourse end marker, that holds the key to forming stories for memory inference
    print("🌋 Reading files")
    # read_corpus_input = input()
    dir_walk_count = 0
    for dirpath, _, filenames in os.walk(data_path):
        # dir_walk_count += 1
        # if dir_walk_count > 5:
        #     break
        print("In", dirpath, os.path.abspath(dirpath))
        file_list = [os.path.join(dirpath, f) for f in filenames
                     if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
        cleaned_file_list = []
        for f in file_list:
            fn = f.split('.')
            if fn[1] == "v4_auto_conll":
                gold = fn[0] + "." + "v4_gold_conll"
                if gold not in file_list:
                    cleaned_file_list.append(f)
            else:
                cleaned_file_list.append(f)
        # doc_list = parallel_process(cleaned_file_list, load_file)  # what is the doc list ?
        for file in cleaned_file_list:  # executor.map(self.load_file, cleaned_file_list):
            docs = load_file(file)
            for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
                print("Imported", name)
                if debug:
                    print("utts_text", utts_text)
                    print("utt_tokens", utt_tokens)
                    print("utts_corefs", utts_corefs)
                    print("utts_speakers", utts_speakers)
                    print("name, part", name, part)
                # utterances_in_doc_input = input()
                self.utts_text += utts_text
                self.utts_tokens += utt_tokens
                self.utts_corefs += utts_corefs
                self.utts_speakers += utts_speakers
                self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
                self.docs_names.append((name, part))
    print("utts_text size", len(self.utts_text))
    print("utts_tokens size", len(self.utts_tokens))
    print("utts_corefs size", len(self.utts_corefs))
    print("utts_speakers size", len(self.utts_speakers))
    print("utts_doc_idx size", len(self.utts_doc_idx))
    print("🌋 Building docs")
    for name, part in self.docs_names:
        self.docs.append(ConllDoc(name=name, part=part, nlp=None,
                                  blacklist=False, consider_speakers=True,
                                  embedding_extractor=self.embed_extractor,
                                  conll=CONLL_GENRES[name[:2]]))
    print("🌋 Loading spacy model")
    try:
        spacy.info('en_core_web_sm')
        model = 'en_core_web_sm'
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        spacy.info('en')
        model = 'en'
    nlp = spacy.load(model)
    print("🌋 Parsing utterances and filling docs")
    doc_iter = (s for s in self.utts_text)
    for utt_tuple in tqdm(zip(nlp.pipe(doc_iter), self.utts_tokens, self.utts_corefs,
                              self.utts_speakers, self.utts_doc_idx)):
        spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
        if debug:
            print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
            # spacy_tokens_in_doc = input()
        doc = spacy_tokens
        if debug:
            out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
                      " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id)
            print(out_str.encode('utf-8'))
        print("CONLL TOKENS HERE ARE")
        print(Doc(nlp.vocab, conll_tokens))
        print("SENTENCE EMBEDDING OF TOKENS ARE")
        # print(self.embed_extractor.get_average_embedding(conll_tokens))
        # out_str_input2 = input()
        self.docs[doc_id].add_conll_utterance(doc, conll_tokens, corefs, speaker,
                                              use_gold_mentions=self.use_gold_mentions)
        del spacy_tokens, conll_tokens, corefs, speaker, doc_id
    del nlp, doc_iter
def read_corpus(self, data_path, model=None, debug=False):
    print("🌋 Reading files")
    for dirpath, _, filenames in os.walk(data_path):
        print("In", dirpath, os.path.abspath(dirpath))
        file_list = [
            os.path.join(dirpath, f) for f in filenames
            if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")
        ]
        cleaned_file_list = []
        for f in file_list:
            fn = f.split(".")
            if fn[1] == "v4_auto_conll":
                gold = fn[0] + "." + "v4_gold_conll"
                if gold not in file_list:
                    cleaned_file_list.append(f)
            else:
                cleaned_file_list.append(f)
        doc_list = parallel_process(cleaned_file_list, load_file)
        for docs in doc_list:  # executor.map(self.load_file, cleaned_file_list):
            for (utts_text, utt_tokens, utts_corefs, utts_speakers, name, part) in docs:
                if debug:
                    print("Imported", name)
                    print("utts_text", utts_text)
                    print("utt_tokens", utt_tokens)
                    print("utts_corefs", utts_corefs)
                    print("utts_speakers", utts_speakers)
                    print("name, part", name, part)
                self.utts_text += utts_text
                self.utts_tokens += utt_tokens
                self.utts_corefs += utts_corefs
                self.utts_speakers += utts_speakers
                self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
                self.docs_names.append((name, part))
    print("utts_text size", len(self.utts_text))
    print("utts_tokens size", len(self.utts_tokens))
    print("utts_corefs size", len(self.utts_corefs))
    print("utts_speakers size", len(self.utts_speakers))
    print("utts_doc_idx size", len(self.utts_doc_idx))
    print("🌋 Building docs")
    for name, part in self.docs_names:
        self.docs.append(
            ConllDoc(
                name=name,
                part=part,
                nlp=None,
                blacklist=self.blacklist,
                consider_speakers=True,
                embedding_extractor=self.embed_extractor,
                conll=CONLL_GENRES[name[:2]],
            ))
    print("🌋 Loading spacy model")
    if model is None:
        model_options = ["en_core_web_lg", "en_core_web_md", "en_core_web_sm", "en"]
        for model_option in model_options:
            if not model:
                try:
                    spacy.info(model_option)
                    model = model_option
                    print("Loading model", model_option)
                except:
                    print("Could not detect model", model_option)
        if not model:
            print("Could not detect any suitable English model")
            return
    else:
        spacy.info(model)
        print("Loading model", model)
    nlp = spacy.load(model)
    print("🌋 Parsing utterances and filling docs with use_gold_mentions=" +
          (str(bool(self.gold_mentions))))
    doc_iter = (s for s in self.utts_text)
    for utt_tuple in tqdm(
            zip(
                nlp.pipe(doc_iter),
                self.utts_tokens,
                self.utts_corefs,
                self.utts_speakers,
                self.utts_doc_idx,
            )):
        spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
        if debug:
            print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
        doc = spacy_tokens
        if debug:
            out_str = ("utterance " + unicode_(doc) + " corefs " + unicode_(corefs) +
                       " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id))
            print(out_str.encode("utf-8"))
        self.docs[doc_id].add_conll_utterance(
            doc, conll_tokens, corefs, speaker, use_gold_mentions=self.gold_mentions)
import spacy

spacy.info()  # Shows the metadata of the installed models.

# Load the model:
# nlp = spacy.load('en')
# Does not work on macOS unless the following command is run first with sudo:
# $ sudo python -m spacy link MODELO es
# where MODELO is 'es_core_news_md' or 'es_core_news_sm' (whichever one was downloaded).
nlp = spacy.load(
    '/Users/lino/anaconda3/lib/python3.6/site-packages/es_core_news_sm/es_core_news_sm-2.0.0'
)

# Load a text
doc = nlp(
    u'Apenas faltan 24 horas para que Tim Cook, consejero delegado de Apple, vuelva al Steve Jobs Theatre, la única zona de su nuevo campus, junto con el centro de visitantes donde pueden acceder los no empleados de la manzana.'
)

# Tokenization:
tokens = [t.text for t in doc]
tokens_palabras = [t.orth_ for t in doc if not t.is_punct]  # Tokens without punctuation marks
tokens_enteros = [token.orth for token in doc]  # Uses a NUMERIC representation of the tokens
tokens_lexicos = [t.orth_ for t in doc if not t.is_punct | t.is_stop]  # Tokens without punctuation or stopwords

from spacy.symbols import ORTH, LEMMA, POS, TAG
@Created : 5/31/18 2:02 PM
@Desc    :
    !pip install spacy -i https://pypi.mirrors.ustc.edu.cn/simple
    !python -m spacy download en_core_web_md
"""
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from time import time
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from functools import reduce

import spacy
from joblib import Parallel, delayed

spacy.info()
spacy.info('en_core_web_md')
# /home/deco/miniconda2/envs/tf17/lib/python3.6/site-packages/en_core_web_md


def load_simple():
    # import en_core_web_md
    # nlp = en_core_web_sm.load()
    nlp = spacy.load('en_core_web_md')  # nlp is a Language instance
    print('pipeline:', nlp.pipeline)
    print('pipe_names:', nlp.pipe_names)
    doc = nlp('Apple is looking at buying U.K. startup for $1 billion')  # callable instance returning Doc instance
    print('tokens:')
import spacy

spacy.info('en')
# spacy.info('en', markdown=True)

nlp = spacy.load('en')
doc = nlp(u"This is a sentence.")
print(doc[0].text)
print(doc[1].text)
print(doc[-1].text)
print(doc[2:3].text)
print([(w.text, w.pos_) for w in doc])
print('*******')

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
print('*******')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
print('*******')

doc = nlp(u'I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
          lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
for i in range(10):
    js.append(json.loads(f.readline()))
f.close()

review_df = pd.DataFrame(js)
review_df.shape


# ### Using spacy: [Installation instructions for spacy](https://spacy.io/docs/usage/)

# In[28]:

import spacy


# In[29]:

# model meta data
spacy.info('en_core_web_sm')


# In[30]:

# preload the language model
nlp = spacy.load('en_core_web_sm')


# In[31]:

review_df['text'][:2]


# In[32]:

# Keeping it in a pandas dataframe
doc_df = review_df['text'].apply(nlp)